/* C/Bcj2Enc.c - platform/external/lzma - Git at Google */

 /* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
 2023-04-02 : Igor Pavlov : Public domain */

 #include "Precomp.h"

 /* #define SHOW_STAT */
 #ifdef SHOW_STAT
 #include <stdio.h>
 #define PRF2(s) printf("%s ip=%8x  tempPos=%d  src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
 #else
 #define PRF2(s)
 #endif

 #include "Bcj2.h"
 #include "CpuArch.h"

 #define kTopValue ((UInt32)1 << 24)
 #define kNumBitModelTotalBits 11
 #define kBitModelTotal (1 << kNumBitModelTotalBits)
 #define kNumMoveBits 5

 /* Bcj2Enc_Init() : sets all fields of encoder object (p) to their start values.
    It must be called before the first Bcj2Enc_Encode() call for (p). */
 void Bcj2Enc_Init(CBcj2Enc *p)
 {
   const unsigned numProbs = (unsigned)(sizeof(p->probs) / sizeof(p->probs[0]));
   unsigned k = 0;

   /* every probability counter starts at half of kBitModelTotal */
   while (k != numProbs)
   {
     p->probs[k] = kBitModelTotal >> 1;
     k++;
   }

   /* fields used by Bcj2_RangeEnc_ShiftLow() and by bit encoding code */
   p->low = 0;
   p->range = 0xffffffff;
   p->cache = 0;
   p->cacheSize = 1;

   /* state of encoding process */
   p->state = BCJ2_ENC_STATE_ORIG;
   p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
   p->isFlushState = 0;
   p->flushRem = 5;
   p->context = 0;
   p->tempPos = 0;

   /* positions and limits that are used for branch conversion checks */
   p->ip64 = 0;
   p->fileIp64 = 0;
   p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
   p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
   // p->relatExcludeBits = 0;
 }

 // Z7_NO_INLINE
 /* Bcj2_RangeEnc_ShiftLow() : moves 8 bits out of (p->low).
    If the top byte of 32-bit low part is not 0xff, or if there is carry in bits [32+],
    then it writes (p->cache + carry) and all postponed 0xff bytes (with carry added)
    to RC stream, and stores the new top byte in (p->cache).
    Otherwise it only increases (p->cacheSize).
    returns:
      True  : RC stream buffer is full; (p->state = BCJ2_STREAM_RC) was set,
              and (p->low) was not shifted.
      False : the shift was done. */
 Z7_FORCE_INLINE
 static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
 {
   const UInt32 low32 = (UInt32)p->low;
   const unsigned carry = (unsigned)
     #if defined(Z7_MSC_VER_ORIGINAL) \
         && defined(MY_CPU_X86) \
         && defined(MY_CPU_LE) \
         && !defined(MY_CPU_64BIT)
       // we try to rid of __aullshr() call in MSVS-x86
       (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
     #else
       (p->low >> 32);
     #endif

   if (carry != 0 || low32 < (UInt32)0xff000000)
   {
     Byte *out = p->bufs[BCJ2_STREAM_RC];
     for (;;)
     {
       if (out == p->lims[BCJ2_STREAM_RC])
       {
         /* no space in RC stream: we save write position and ask caller for new buffer */
         p->bufs[BCJ2_STREAM_RC] = out;
         p->state = BCJ2_STREAM_RC;
         return True;
       }
       *out++ = (Byte)(p->cache + carry);
       p->cache = 0xff;
       if (--p->cacheSize == 0)
         break;
     }
     p->bufs[BCJ2_STREAM_RC] = out;
     p->cache = (Byte)(low32 >> 24);
   }

   p->cacheSize++;
   p->low = low32 << 8;
   return False;
 }


 /*
 We can use 2 alternative versions of code:
 1) non-marker version:
   Byte CBcj2Enc::context
   Byte temp[8];
   Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
   Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
   with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.

 2) marker version:
   UInt32 CBcj2Enc::context
   Byte CBcj2Enc::temp[4];
   MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains the found marker.
   it's allowed that
     one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
     and another call of Bcj2Enc_Encode_2() does offset conversion.
     So different values of (fileIp) and (fileSize) are possible
     in these different Bcj2Enc_Encode_2() calls.

 Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
 So we use non-marker version.
 */

 /*
   Corner cases with overlap in multi-block.
   before v23: there was one corner case, where converted instruction
     could start in one sub-stream and finish in next sub-stream.
   If multi-block (solid) encoding is used,
     and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
     and (0f) is last byte of previous sub-stream
     and (8x) is first byte of current sub-stream
   then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
   BCJ2 encoder can convert 32-bit offset for that (0f 8x) pair,
   if that offset meets limit requirements.
   If encoder allows 32-bit offset conversion for such overlap case,
   then the data in 3 uncompressed BCJ2 streams for some sub-stream
   can depend on data of previous sub-stream.
   That corner case is not big problem, and it's rare case.
   Since v23.00 we do additional check to prevent conversions in such overlap cases.
 */

 /*
   Bcj2Enc_Encode_2() output variables at exit:
   {
     if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
     {
       it means that encoder needs more input data.
       if (p->srcLim == p->src) at exit, then
       {
         (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
         all input data were read and processed, and we are ready for
         new input data.
       }
       else
       {
         (p->srcLim != p->src)
         (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
           The encoder has found e8/e9/0f_8x marker,
           and p->src points to last byte of that marker,
           Bcj2Enc_Encode_2() needs more input data to get totally
           5 bytes (last byte of marker and 32-bit branch offset)
           as continuous array starting from p->src.
         (p->srcLim - p->src < 5) requirement is met after exit.
           So non-processed residue from p->src to p->srcLim is always less than 5 bytes.
       }
     }
   }
 */

 /*
   Bcj2Enc_Encode_2() : core encoding step (non-marker version).
   If (p->isFlushState) is not set:
     - if (p->state) is a 32-bit stream state (BCJ2_IS_32BIT_STREAM), it first writes
       the pending (p->tempTarget) value to that stream, or returns if that stream
       still has no free space;
     - then it copies bytes from (p->src) to MAIN stream until a marker
       e8 / e9 / (0f 8x) is found, or until source data or MAIN buffer space ends;
     - for each processed marker it encodes one bit with range coder:
       "not converted" or "converted". Conversion is possible only if 4 bytes
       after marker are available. For converted branch it writes (ip + relat)
       via SetBe32a() to CALL or JUMP stream and skips these 4 source bytes.
   In flush state it calls Bcj2_RangeEnc_ShiftLow() until (p->flushRem == 0),
   and then sets (p->state = BCJ2_ENC_STATE_FINISHED).
   (p->state) at exit shows the reason of exit: BCJ2_ENC_STATE_ORIG (more input
   is needed), a stream index (the buffer of that stream is full),
   or BCJ2_ENC_STATE_FINISHED.
 */
 Z7_NO_INLINE
 static void Bcj2Enc_Encode_2(CBcj2Enc *p)
 {
   if (!p->isFlushState)
   {
     const Byte *src;
     UInt32 v;
     {
       // previous call could exit because CALL/JUMP stream was full:
       // we write the saved (p->tempTarget) now, if there is space.
       const unsigned state = p->state;
       if (BCJ2_IS_32BIT_STREAM(state))
       {
         Byte *cur = p->bufs[state];
         if (cur == p->lims[state])
           return;
         SetBe32a(cur, p->tempTarget)
         p->bufs[state] = cur + 4;
       }
     }
     p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
     src = p->src;
     v = p->context;

     // #define WRITE_CONTEXT  p->context = v; // for marker version
     #define WRITE_CONTEXT           p->context = (Byte)v;
     #define WRITE_CONTEXT_AND_SRC   p->src = src;  WRITE_CONTEXT

     for (;;)
     {
       // const Byte *src;
       // UInt32 v;
       CBcj2Enc_ip_unsigned ip;
       // range coder normalization: it's done before processing of next marker
       if (p->range < kTopValue)
       {
         // to reduce register pressure and code size: we save and restore local variables.
         WRITE_CONTEXT_AND_SRC
         if (Bcj2_RangeEnc_ShiftLow(p))
           return;
         p->range <<= 8;
         src = p->src;
         v = p->context;
       }
       // src = p->src;
       // #define MARKER_FLAG  ((UInt32)1 << 17)
       // if ((v & MARKER_FLAG) == 0) // for marker version
       {
         const Byte *srcLim;
         Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
         {
           // srcLim : limit for copying = min(available source bytes, free space in MAIN stream)
           const SizeT remSrc = (SizeT)(p->srcLim - src);
           SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
           if (rem >= remSrc)
             rem = remSrc;
           srcLim = src + rem;
         }
         /* p->context contains context of previous byte:
            bits [0 : 7]  : src[-1], if (src) was changed in this call
            bits [8 : 31] : are undefined for non-marker version
         */
         // v = p->context;
         /* ONE_ITER copies one byte (b) from (src) to (dest),
            sets v = (previous byte << NUM_SHIFT_BITS) | b, and breaks from loop
              - if (b) is e8 or e9,
              - if (previous byte is 0f) and (b) is in range [80 : 8f],
              - if (src) reaches (srcLim).
            In first two cases (src) still points to the last byte of marker. */
         #define NUM_SHIFT_BITS  24
         #define CONV_FLAG  ((UInt32)1 << 16)
         #define ONE_ITER { \
           b = src[0]; \
           *dest++ = (Byte)b; \
           v = (v << NUM_SHIFT_BITS) | b; \
           if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
           if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
               ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
           src++; if (src == srcLim) { break; } }

         if (src != srcLim)
         for (;;)
         {
           /* clang can generate ineffective code with setne instead of two jcc instructions.
              we can use 2 iterations and external (unsigned b) to avoid that ineffective code generation. */
           unsigned b;
           ONE_ITER
           ONE_ITER
         }

         // we advance (ip64) by the number of bytes that were written to MAIN stream
         ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
         p->bufs[BCJ2_STREAM_MAIN] = dest;
         p->ip64 = ip;

         if (src == srcLim)
         {
           // no marker was found in processed data
           WRITE_CONTEXT_AND_SRC
           if (src != p->srcLim)
           {
             // there are more source bytes, so MAIN stream buffer is full
             p->state = BCJ2_STREAM_MAIN;
             return;
           }
           /* (p->src == p->srcLim)
           (p->state == BCJ2_ENC_STATE_ORIG) */
           if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
             return;
           /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM) */
           // (p->flushRem == 5);
           p->isFlushState = 1;
           break;
         }
         // we skip the last byte of marker: (src) now points to 32-bit offset field
         src++;
         // p->src = src;
       }
       // ip = p->ip; // for marker version
       /* marker was found */
       /* (v) contains marker that was found:
            bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
                          : value of src[-2] : xx/xx/0f
            bits [0 : 7]  : value of src[-1] : e8/e9/8x
       */
       {
         {
         #if NUM_SHIFT_BITS != 24
           v &= ~(UInt32)CONV_FLAG;
         #endif
           // UInt32 relat = 0;
           if ((SizeT)(p->srcLim - src) >= 4)
           {
             /*
             if (relat != 0 || (Byte)v != 0xe8)
             BoolInt isBigOffset = True;
             */
             const UInt32 relat = GetUi32(src);
             /*
             #define EXCLUDE_FLAG  ((UInt32)1 << 4)
             #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
             if (p->relatExcludeBits != 0)
             {
               const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
               isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
             }
             // isBigOffset = False; // for debug
             */
             // (ip) becomes the position relative to (p->fileIp64)
             ip -= p->fileIp64;
             // Use the following if check, if (ip) is 64-bit:
             // CONV_FLAG is set only if all 3 checks are passed:
             //   - (ip > 0) for e8/e9, and (ip > 1) for (0f 8x),
             //   - (ip + 4 + relat) is not larger than (p->fileSize64_minus1),
             //   - (relat) is inside the range allowed by (p->relatLimit).
             if (ip > (((v + 0x20) >> 5) & 1))  // 23.00 : we eliminate multi-block overlap for (0f 80) and (e8/e9)
             if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
             if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
               v |= CONV_FLAG;
           }
           else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
           {
             // (p->srcLim - src < 4)
             // we need more input data: so we undo the writing of last byte of marker
             // to MAIN stream, and (p->src) will point to that last byte of marker.
             // /*
             // for non-marker version
             p->ip64--; // p->ip = ip - 1;
             p->bufs[BCJ2_STREAM_MAIN]--;
             src--;
             v >>= NUM_SHIFT_BITS;
             // (0 < p->srcLim - p->src <= 4)
             // */
             // v |= MARKER_FLAG; // for marker version
             /* (p->state == BCJ2_ENC_STATE_ORIG) */
             WRITE_CONTEXT_AND_SRC
             return;
           }
           {
             // c = 1 only if the marker byte is e8.
             // index of prob: (2 + previous byte) for e8, (1) for e9, (0) for (0f 8x).
             const unsigned c = ((v + 0x17) >> 6) & 1;
             CBcj2Prob *prob = p->probs + (unsigned)
                 (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
             /*
                 ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
                 ((Byte)v < 0xe8 ? 0 : 1));  // ((v >> 5) & 1));
             */
             const unsigned ttt = *prob;
             const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
             if ((v & CONV_FLAG) == 0)
             {
               // "not converted" symbol: we use lower part of range, and (*prob) is increased
               // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
               // v = (Byte)v; // for marker version
               p->range = bound;
               *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
               // WRITE_CONTEXT_AND_SRC
               continue;
             }
             // "converted" symbol: we use upper part of range, and (*prob) is decreased
             p->low += bound;
             p->range -= bound;
             *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
           }
           // p->context = src[3];
           {
             // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
             const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
             ip = p->ip64;
             v = GetUi32(src); // relat
             ip += 4;
             p->ip64 = ip;
             src += 4;
             // p->src = src;
             {
               // absol : (position after 4-byte offset field) + (relat)
               const UInt32 absol = (UInt32)ip + v;
               Byte *cur = p->bufs[cj];
               // new context : the highest byte of (relat) value
               v >>= 24;
               // WRITE_CONTEXT
               if (cur == p->lims[cj])
               {
                 // no space in (cj) stream: (absol) will be written at start of next call
                 p->state = cj;
                 p->tempTarget = absol;
                 WRITE_CONTEXT_AND_SRC
                 return;
               }
               SetBe32a(cur, absol)
               p->bufs[cj] = cur + 4;
             }
           }
         }
       }
     } // end of loop
   }

   // flush state: we call ShiftLow() (p->flushRem) times to write remaining range coder data.
   // If RC stream buffer is full, we return and continue the flushing in next call.
   for (; p->flushRem != 0; p->flushRem--)
     if (Bcj2_RangeEnc_ShiftLow(p))
       return;
   p->state = BCJ2_ENC_STATE_FINISHED;
 }


 /*
 BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
 So base function Bcj2Enc_Encode_2()
   in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
   (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
 Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
   so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
     then (p->src == p->srcLim).
   And the caller's code is simpler with Bcj2Enc_Encode().
 */

 /*
   Bcj2Enc_Encode() : encoding function for the caller.
   It calls Bcj2Enc_Encode_2() and uses p->temp[] to keep the non-processed
   tail of input data between calls:
     1) if temp[] contains bytes (p->tempPos != 0), these bytes are encoded first.
        Bytes from (p->src) are appended to temp[] with step of 1 byte, if required.
     2) then the data at (p->src) is encoded directly.
     3) if (p->state == BCJ2_ENC_STATE_ORIG) after that, and some input bytes
        are still not processed, these bytes are copied to temp[],
        and (p->src) is set to (p->srcLim).
 */
 Z7_NO_INLINE
 void Bcj2Enc_Encode(CBcj2Enc *p)
 {
   PRF2("\n----")
   if (p->tempPos != 0)
   {
     /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
     unsigned extra = 0;
     /* We will touch only minimal required number of bytes in input (src) stream.
        So we will add input bytes from (src) stream to temp[] with step of 1 byte.
        We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
          in first loop iteration because
          - previous call of Bcj2Enc_Encode() could use another (finishMode),
          - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
        the case with full temp[] buffer (p->tempPos == 4) is possible here.
     */
     for (;;)
     {
       // (0 < p->tempPos <= 5) // in non-marker version
       /* p->src : the current src data position including extra bytes
                   that were copied to temp[] buffer in this call */
       // we save the caller's input range and finishMode, because
       // Bcj2Enc_Encode_2() will work with temp[] as input buffer.
       const Byte *src = p->src;
       const Byte *srcLim = p->srcLim;
       const EBcj2Enc_FinishMode finishMode = p->finishMode;
       if (src != srcLim)
       {
         /* if there are some src data after the data copied to temp[],
            then we use MODE_CONTINUE for temp data */
         p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
       }
       p->src = p->temp;
       p->srcLim = p->temp + p->tempPos;
       PRF2("    ")
       Bcj2Enc_Encode_2(p);
       {
         // num : number of temp[] bytes that were processed by Bcj2Enc_Encode_2().
         // we move the non-processed bytes to the start of temp[].
         const unsigned num = (unsigned)(p->src - p->temp);
         const unsigned tempPos = p->tempPos - num;
         unsigned i;
         p->tempPos = tempPos;
         for (i = 0; i < tempPos; i++)
           p->temp[i] = p->temp[(SizeT)i + num];
         // tempPos : number of bytes in temp buffer
         // we restore the caller's input range and finishMode
         p->src = src;
         p->srcLim = srcLim;
         p->finishMode = finishMode;
         if (p->state != BCJ2_ENC_STATE_ORIG)
         {
           // (p->tempPos <= 4) // in non-marker version
           /* if (the reason of exit from Bcj2Enc_Encode_2()
                  is not BCJ2_ENC_STATE_ORIG),
              then we exit from Bcj2Enc_Encode() with same reason */
           // optional code begin : we rollback (src) and tempPos, if it's possible:
           if (extra >= tempPos)
             extra = tempPos;
           p->src = src - extra;
           p->tempPos = tempPos - extra;
           // optional code end : rollback of (src) and tempPos
           return;
         }
         /* (p->tempPos <= 4)
            (p->state == BCJ2_ENC_STATE_ORIG)
              so encoder needs more data than in temp[] */
         if (src == srcLim)
           return; // src buffer has no more input data.
         /* (src != srcLim)
            so we can provide more input data from src for Bcj2Enc_Encode_2() */
         if (extra >= tempPos)
         {
           /* (extra >= tempPos) means that temp buffer contains
              only data from src buffer of this call.
              So now we can encode without temp buffer */
           p->src = src - tempPos; // rollback (src)
           p->tempPos = 0;
           break;
         }
         // we append one additional extra byte from (src) to temp[] buffer:
         p->temp[tempPos] = *src;
         p->tempPos = tempPos + 1;
         // (0 < p->tempPos <= 5) // in non-marker version
         p->src = src + 1;
         extra++;
       }
     }
   }

   PRF2("++++")
   // (p->tempPos == 0)
   // direct encoding of data in caller's (src) buffer
   Bcj2Enc_Encode_2(p);
   PRF2("====")

   if (p->state == BCJ2_ENC_STATE_ORIG)
   {
     const Byte *src = p->src;
     const Byte *srcLim = p->srcLim;
     const unsigned rem = (unsigned)(srcLim - src);
     /* (rem <= 4) here.
        if (p->src != p->srcLim), then
          - we copy non-processed bytes from (p->src) to temp[] buffer,
          - we set p->src equal to p->srcLim.
     */
     if (rem)
     {
       unsigned i = 0;
       p->src = srcLim;
       p->tempPos = rem;
       // (0 < p->tempPos <= 4)
       do
         p->temp[i] = src[i];
       while (++i != rem);
     }
     // (p->tempPos <= 4)
     // (p->src == p->srcLim)
   }
 }

 #undef PRF2
 #undef CONV_FLAG
 #undef MARKER_FLAG
 #undef WRITE_CONTEXT
 #undef WRITE_CONTEXT_AND_SRC
 #undef ONE_ITER
 #undef NUM_SHIFT_BITS
 #undef kTopValue
 #undef kNumBitModelTotalBits
 #undef kBitModelTotal
 #undef kNumMoveBits
	/* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2)
	2023-04-02 : Igor Pavlov : Public domain */

	#include "Precomp.h"

	/* #define SHOW_STAT */
	#ifdef SHOW_STAT
	#include <stdio.h>
	#define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src));
	#else
	#define PRF2(s)
	#endif

	#include "Bcj2.h"
	#include "CpuArch.h"

	#define kTopValue ((UInt32)1 << 24)
	#define kNumBitModelTotalBits 11
	#define kBitModelTotal (1 << kNumBitModelTotalBits)
	#define kNumMoveBits 5

	/* Bcj2Enc_Init() : sets all fields of encoder object (p) to their start values.
	   It must be called before the first Bcj2Enc_Encode() call for (p). */
	void Bcj2Enc_Init(CBcj2Enc *p)
	{
	  const unsigned numProbs = (unsigned)(sizeof(p->probs) / sizeof(p->probs[0]));
	  unsigned k = 0;

	  /* every probability counter starts at half of kBitModelTotal */
	  while (k != numProbs)
	  {
	    p->probs[k] = kBitModelTotal >> 1;
	    k++;
	  }

	  /* fields used by Bcj2_RangeEnc_ShiftLow() and by bit encoding code */
	  p->low = 0;
	  p->range = 0xffffffff;
	  p->cache = 0;
	  p->cacheSize = 1;

	  /* state of encoding process */
	  p->state = BCJ2_ENC_STATE_ORIG;
	  p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
	  p->isFlushState = 0;
	  p->flushRem = 5;
	  p->context = 0;
	  p->tempPos = 0;

	  /* positions and limits that are used for branch conversion checks */
	  p->ip64 = 0;
	  p->fileIp64 = 0;
	  p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED;
	  p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT;
	  // p->relatExcludeBits = 0;
	}

	// Z7_NO_INLINE
	/* Bcj2_RangeEnc_ShiftLow() : moves 8 bits out of (p->low).
	   If the top byte of 32-bit low part is not 0xff, or if there is carry in bits [32+],
	   then it writes (p->cache + carry) and all postponed 0xff bytes (with carry added)
	   to RC stream, and stores the new top byte in (p->cache).
	   Otherwise it only increases (p->cacheSize).
	   returns:
	     True  : RC stream buffer is full; (p->state = BCJ2_STREAM_RC) was set,
	             and (p->low) was not shifted.
	     False : the shift was done. */
	Z7_FORCE_INLINE
	static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p)
	{
	  const UInt32 low = (UInt32)p->low;
	  const unsigned high = (unsigned)
	    #if defined(Z7_MSC_VER_ORIGINAL) \
	        && defined(MY_CPU_X86) \
	        && defined(MY_CPU_LE) \
	        && !defined(MY_CPU_64BIT)
	      // we try to rid of __aullshr() call in MSVS-x86
	      (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only
	    #else
	      (p->low >> 32);
	    #endif
	  /* fixed: the operator here was written as an escaped "\|\|" sequence, that is not valid C */
	  if (low < (UInt32)0xff000000 || high != 0)
	  {
	    Byte *buf = p->bufs[BCJ2_STREAM_RC];
	    do
	    {
	      if (buf == p->lims[BCJ2_STREAM_RC])
	      {
	        /* no space in RC stream: we save write position and ask caller for new buffer */
	        p->state = BCJ2_STREAM_RC;
	        p->bufs[BCJ2_STREAM_RC] = buf;
	        return True;
	      }
	      *buf++ = (Byte)(p->cache + high);
	      p->cache = 0xff;
	    }
	    while (--p->cacheSize);
	    p->bufs[BCJ2_STREAM_RC] = buf;
	    p->cache = (Byte)(low >> 24);
	  }
	  p->cacheSize++;
	  p->low = low << 8;
	  return False;
	}


	/*
	We can use 2 alternative versions of code:
	1) non-marker version:
	Byte CBcj2Enc::context
	Byte temp[8];
	Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer.
	Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction
	with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call.

	2) marker version:
	UInt32 CBcj2Enc::context
	Byte CBcj2Enc::temp[4];
	MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains the found marker.
	it's allowed that
	one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest,
	and another call of Bcj2Enc_Encode_2() does offset conversion.
	So different values of (fileIp) and (fileSize) are possible
	in these different Bcj2Enc_Encode_2() calls.

	Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop.
	So we use non-marker version.
	*/

	/*
	Corner cases with overlap in multi-block.
	before v23: there was one corner case, where converted instruction
	could start in one sub-stream and finish in next sub-stream.
	If multi-block (solid) encoding is used,
	and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream.
	and (0f) is last byte of previous sub-stream
	and (8x) is first byte of current sub-stream
	then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder.
	BCJ2 encoder can convert 32-bit offset for that (0f 8x) pair,
	if that offset meets limit requirements.
	If encoder allows 32-bit offset conversion for such overlap case,
	then the data in 3 uncompressed BCJ2 streams for some sub-stream
	can depend on data of previous sub-stream.
	That corner case is not big problem, and it's rare case.
	Since v23.00 we do additional check to prevent conversions in such overlap cases.
	*/

	/*
	Bcj2Enc_Encode_2() output variables at exit:
	{
	if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG))
	{
	it means that encoder needs more input data.
	if (p->srcLim == p->src) at exit, then
	{
	(p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
	all input data were read and processed, and we are ready for
	new input data.
	}
	else
	{
	(p->srcLim != p->src)
	(p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
	The encoder has found e8/e9/0f_8x marker,
	and p->src points to last byte of that marker,
	Bcj2Enc_Encode_2() needs more input data to get totally
	5 bytes (last byte of marker and 32-bit branch offset)
	as continuous array starting from p->src.
	(p->srcLim - p->src < 5) requirement is met after exit.
	So non-processed residue from p->src to p->srcLim is always less than 5 bytes.
	}
	}
	}
	*/

	/*
	  Bcj2Enc_Encode_2() : core encoding step (non-marker version).
	  If (p->isFlushState) is not set:
	    - if (p->state) is a 32-bit stream state (BCJ2_IS_32BIT_STREAM), it first writes
	      the pending (p->tempTarget) value to that stream, or returns if that stream
	      still has no free space;
	    - then it copies bytes from (p->src) to MAIN stream until a marker
	      e8 / e9 / (0f 8x) is found, or until source data or MAIN buffer space ends;
	    - for each processed marker it encodes one bit with range coder:
	      "not converted" or "converted". Conversion is possible only if 4 bytes
	      after marker are available. For converted branch it writes (ip + relat)
	      via SetBe32a() to CALL or JUMP stream and skips these 4 source bytes.
	  In flush state it calls Bcj2_RangeEnc_ShiftLow() until (p->flushRem == 0),
	  and then sets (p->state = BCJ2_ENC_STATE_FINISHED).
	  (p->state) at exit shows the reason of exit: BCJ2_ENC_STATE_ORIG (more input
	  is needed), a stream index (the buffer of that stream is full),
	  or BCJ2_ENC_STATE_FINISHED.
	*/
	Z7_NO_INLINE
	static void Bcj2Enc_Encode_2(CBcj2Enc *p)
	{
	  if (!p->isFlushState)
	  {
	    const Byte *src;
	    UInt32 v;
	    {
	      // previous call could exit because CALL/JUMP stream was full:
	      // we write the saved (p->tempTarget) now, if there is space.
	      const unsigned state = p->state;
	      if (BCJ2_IS_32BIT_STREAM(state))
	      {
	        Byte *cur = p->bufs[state];
	        if (cur == p->lims[state])
	          return;
	        SetBe32a(cur, p->tempTarget)
	        p->bufs[state] = cur + 4;
	      }
	    }
	    p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit
	    src = p->src;
	    v = p->context;

	    // #define WRITE_CONTEXT  p->context = v; // for marker version
	    #define WRITE_CONTEXT           p->context = (Byte)v;
	    #define WRITE_CONTEXT_AND_SRC   p->src = src;  WRITE_CONTEXT

	    for (;;)
	    {
	      // const Byte *src;
	      // UInt32 v;
	      CBcj2Enc_ip_unsigned ip;
	      // range coder normalization: it's done before processing of next marker
	      if (p->range < kTopValue)
	      {
	        // to reduce register pressure and code size: we save and restore local variables.
	        WRITE_CONTEXT_AND_SRC
	        if (Bcj2_RangeEnc_ShiftLow(p))
	          return;
	        p->range <<= 8;
	        src = p->src;
	        v = p->context;
	      }
	      // src = p->src;
	      // #define MARKER_FLAG  ((UInt32)1 << 17)
	      // if ((v & MARKER_FLAG) == 0) // for marker version
	      {
	        const Byte *srcLim;
	        Byte *dest = p->bufs[BCJ2_STREAM_MAIN];
	        {
	          // srcLim : limit for copying = min(available source bytes, free space in MAIN stream)
	          const SizeT remSrc = (SizeT)(p->srcLim - src);
	          SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest);
	          if (rem >= remSrc)
	            rem = remSrc;
	          srcLim = src + rem;
	        }
	        /* p->context contains context of previous byte:
	           bits [0 : 7]  : src[-1], if (src) was changed in this call
	           bits [8 : 31] : are undefined for non-marker version
	        */
	        // v = p->context;
	        /* ONE_ITER copies one byte (b) from (src) to (dest),
	           sets v = (previous byte << NUM_SHIFT_BITS) | b, and breaks from loop
	             - if (b) is e8 or e9,
	             - if (previous byte is 0f) and (b) is in range [80 : 8f],
	             - if (src) reaches (srcLim).
	           In first two cases (src) still points to the last byte of marker.
	           fixed: the bitwise OR in this macro was written as an escaped sequence,
	           that is not valid C. */
	        #define NUM_SHIFT_BITS  24
	        #define CONV_FLAG  ((UInt32)1 << 16)
	        #define ONE_ITER { \
	          b = src[0]; \
	          *dest++ = (Byte)b; \
	          v = (v << NUM_SHIFT_BITS) | b; \
	          if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \
	          if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \
	              ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \
	          src++; if (src == srcLim) { break; } }

	        if (src != srcLim)
	        for (;;)
	        {
	          /* clang can generate ineffective code with setne instead of two jcc instructions.
	             we can use 2 iterations and external (unsigned b) to avoid that ineffective code generation. */
	          unsigned b;
	          ONE_ITER
	          ONE_ITER
	        }

	        // we advance (ip64) by the number of bytes that were written to MAIN stream
	        ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]);
	        p->bufs[BCJ2_STREAM_MAIN] = dest;
	        p->ip64 = ip;

	        if (src == srcLim)
	        {
	          // no marker was found in processed data
	          WRITE_CONTEXT_AND_SRC
	          if (src != p->srcLim)
	          {
	            // there are more source bytes, so MAIN stream buffer is full
	            p->state = BCJ2_STREAM_MAIN;
	            return;
	          }
	          /* (p->src == p->srcLim)
	          (p->state == BCJ2_ENC_STATE_ORIG) */
	          if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM)
	            return;
	          /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM) */
	          // (p->flushRem == 5);
	          p->isFlushState = 1;
	          break;
	        }
	        // we skip the last byte of marker: (src) now points to 32-bit offset field
	        src++;
	        // p->src = src;
	      }
	      // ip = p->ip; // for marker version
	      /* marker was found */
	      /* (v) contains marker that was found:
	           bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7]
	                         : value of src[-2] : xx/xx/0f
	           bits [0 : 7]  : value of src[-1] : e8/e9/8x
	      */
	      {
	        {
	        #if NUM_SHIFT_BITS != 24
	          v &= ~(UInt32)CONV_FLAG;
	        #endif
	          // UInt32 relat = 0;
	          if ((SizeT)(p->srcLim - src) >= 4)
	          {
	            /*
	            if (relat != 0 || (Byte)v != 0xe8)
	            BoolInt isBigOffset = True;
	            */
	            const UInt32 relat = GetUi32(src);
	            /*
	            #define EXCLUDE_FLAG  ((UInt32)1 << 4)
	            #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0)
	            if (p->relatExcludeBits != 0)
	            {
	              const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1);
	              isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0);
	            }
	            // isBigOffset = False; // for debug
	            */
	            // (ip) becomes the position relative to (p->fileIp64)
	            ip -= p->fileIp64;
	            // Use the following if check, if (ip) is 64-bit:
	            // CONV_FLAG is set only if all 3 checks are passed:
	            //   - (ip > 0) for e8/e9, and (ip > 1) for (0f 8x),
	            //   - (ip + 4 + relat) is not larger than (p->fileSize64_minus1),
	            //   - (relat) is inside the range allowed by (p->relatLimit).
	            if (ip > (((v + 0x20) >> 5) & 1))  // 23.00 : we eliminate multi-block overlap for (0f 80) and (e8/e9)
	            if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1)
	            if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit)
	              v |= CONV_FLAG;
	          }
	          else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE)
	          {
	            // (p->srcLim - src < 4)
	            // we need more input data: so we undo the writing of last byte of marker
	            // to MAIN stream, and (p->src) will point to that last byte of marker.
	            // /*
	            // for non-marker version
	            p->ip64--; // p->ip = ip - 1;
	            p->bufs[BCJ2_STREAM_MAIN]--;
	            src--;
	            v >>= NUM_SHIFT_BITS;
	            // (0 < p->srcLim - p->src <= 4)
	            // */
	            // v |= MARKER_FLAG; // for marker version
	            /* (p->state == BCJ2_ENC_STATE_ORIG) */
	            WRITE_CONTEXT_AND_SRC
	            return;
	          }
	          {
	            // c = 1 only if the marker byte is e8.
	            // index of prob: (2 + previous byte) for e8, (1) for e9, (0) for (0f 8x).
	            const unsigned c = ((v + 0x17) >> 6) & 1;
	            CBcj2Prob *prob = p->probs + (unsigned)
	                (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1));
	            /*
	                ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) :
	                ((Byte)v < 0xe8 ? 0 : 1));  // ((v >> 5) & 1));
	            */
	            const unsigned ttt = *prob;
	            const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt;
	            if ((v & CONV_FLAG) == 0)
	            {
	              // "not converted" symbol: we use lower part of range, and (*prob) is increased
	              // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy);
	              // v = (Byte)v; // for marker version
	              p->range = bound;
	              *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits));
	              // WRITE_CONTEXT_AND_SRC
	              continue;
	            }
	            // "converted" symbol: we use upper part of range, and (*prob) is decreased
	            p->low += bound;
	            p->range -= bound;
	            *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits));
	          }
	          // p->context = src[3];
	          {
	            // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP);
	            const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL;
	            ip = p->ip64;
	            v = GetUi32(src); // relat
	            ip += 4;
	            p->ip64 = ip;
	            src += 4;
	            // p->src = src;
	            {
	              // absol : (position after 4-byte offset field) + (relat)
	              const UInt32 absol = (UInt32)ip + v;
	              Byte *cur = p->bufs[cj];
	              // new context : the highest byte of (relat) value
	              v >>= 24;
	              // WRITE_CONTEXT
	              if (cur == p->lims[cj])
	              {
	                // no space in (cj) stream: (absol) will be written at start of next call
	                p->state = cj;
	                p->tempTarget = absol;
	                WRITE_CONTEXT_AND_SRC
	                return;
	              }
	              SetBe32a(cur, absol)
	              p->bufs[cj] = cur + 4;
	            }
	          }
	        }
	      }
	    } // end of loop
	  }

	  // flush state: we call ShiftLow() (p->flushRem) times to write remaining range coder data.
	  // If RC stream buffer is full, we return and continue the flushing in next call.
	  for (; p->flushRem != 0; p->flushRem--)
	    if (Bcj2_RangeEnc_ShiftLow(p))
	      return;
	  p->state = BCJ2_ENC_STATE_FINISHED;
	}


	/*
	BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer.
	So base function Bcj2Enc_Encode_2()
	in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with
	(p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim)
	Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer.
	so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(),
	then (p->src == p->srcLim).
	And the caller's code is simpler with Bcj2Enc_Encode().
	*/

	/*
	Bcj2Enc_Encode() : encoding function for the caller.
	It calls Bcj2Enc_Encode_2() and uses p->temp[] to keep the non-processed
	tail of input data between calls:
	1) if temp[] contains bytes (p->tempPos != 0), these bytes are encoded first.
	   Bytes from (p->src) are appended to temp[] with step of 1 byte, if required.
	2) then the data at (p->src) is encoded directly.
	3) if (p->state == BCJ2_ENC_STATE_ORIG) after that, and some input bytes
	   are still not processed, these bytes are copied to temp[],
	   and (p->src) is set to (p->srcLim).
	*/
	Z7_NO_INLINE
	void Bcj2Enc_Encode(CBcj2Enc *p)
	{
	PRF2("\n----")
	if (p->tempPos != 0)
	{
	/* extra: number of bytes that were copied from (src) to (temp) buffer in this call */
	unsigned extra = 0;
	/* We will touch only minimal required number of bytes in input (src) stream.
	So we will add input bytes from (src) stream to temp[] with step of 1 byte.
	We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call
	in first loop iteration because
	- previous call of Bcj2Enc_Encode() could use another (finishMode),
	- previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG).
	the case with full temp[] buffer (p->tempPos == 4) is possible here.
	*/
	for (;;)
	{
	// (0 < p->tempPos <= 5) // in non-marker version
	/* p->src : the current src data position including extra bytes
	that were copied to temp[] buffer in this call */
	// we save the caller's input range and finishMode, because
	// Bcj2Enc_Encode_2() will work with temp[] as input buffer.
	const Byte *src = p->src;
	const Byte *srcLim = p->srcLim;
	const EBcj2Enc_FinishMode finishMode = p->finishMode;
	if (src != srcLim)
	{
	/* if there are some src data after the data copied to temp[],
	then we use MODE_CONTINUE for temp data */
	p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE;
	}
	p->src = p->temp;
	p->srcLim = p->temp + p->tempPos;
	PRF2(" ")
	Bcj2Enc_Encode_2(p);
	{
	// num : number of temp[] bytes that were processed by Bcj2Enc_Encode_2().
	// we move the non-processed bytes to the start of temp[].
	const unsigned num = (unsigned)(p->src - p->temp);
	const unsigned tempPos = p->tempPos - num;
	unsigned i;
	p->tempPos = tempPos;
	for (i = 0; i < tempPos; i++)
	p->temp[i] = p->temp[(SizeT)i + num];
	// tempPos : number of bytes in temp buffer
	// we restore the caller's input range and finishMode
	p->src = src;
	p->srcLim = srcLim;
	p->finishMode = finishMode;
	if (p->state != BCJ2_ENC_STATE_ORIG)
	{
	// (p->tempPos <= 4) // in non-marker version
	/* if (the reason of exit from Bcj2Enc_Encode_2()
	is not BCJ2_ENC_STATE_ORIG),
	then we exit from Bcj2Enc_Encode() with same reason */
	// optional code begin : we rollback (src) and tempPos, if it's possible:
	if (extra >= tempPos)
	extra = tempPos;
	p->src = src - extra;
	p->tempPos = tempPos - extra;
	// optional code end : rollback of (src) and tempPos
	return;
	}
	/* (p->tempPos <= 4)
	(p->state == BCJ2_ENC_STATE_ORIG)
	so encoder needs more data than in temp[] */
	if (src == srcLim)
	return; // src buffer has no more input data.
	/* (src != srcLim)
	so we can provide more input data from src for Bcj2Enc_Encode_2() */
	if (extra >= tempPos)
	{
	/* (extra >= tempPos) means that temp buffer contains
	only data from src buffer of this call.
	So now we can encode without temp buffer */
	p->src = src - tempPos; // rollback (src)
	p->tempPos = 0;
	break;
	}
	// we append one additional extra byte from (src) to temp[] buffer:
	p->temp[tempPos] = *src;
	p->tempPos = tempPos + 1;
	// (0 < p->tempPos <= 5) // in non-marker version
	p->src = src + 1;
	extra++;
	}
	}
	}

	PRF2("++++")
	// (p->tempPos == 0)
	// direct encoding of data in caller's (src) buffer
	Bcj2Enc_Encode_2(p);
	PRF2("====")

	if (p->state == BCJ2_ENC_STATE_ORIG)
	{
	const Byte *src = p->src;
	const Byte *srcLim = p->srcLim;
	const unsigned rem = (unsigned)(srcLim - src);
	/* (rem <= 4) here.
	if (p->src != p->srcLim), then
	- we copy non-processed bytes from (p->src) to temp[] buffer,
	- we set p->src equal to p->srcLim.
	*/
	if (rem)
	{
	unsigned i = 0;
	p->src = srcLim;
	p->tempPos = rem;
	// (0 < p->tempPos <= 4)
	do
	p->temp[i] = src[i];
	while (++i != rem);
	}
	// (p->tempPos <= 4)
	// (p->src == p->srcLim)
	}
	}

	#undef PRF2
	#undef CONV_FLAG
	#undef MARKER_FLAG
	#undef WRITE_CONTEXT
	#undef WRITE_CONTEXT_AND_SRC
	#undef ONE_ITER
	#undef NUM_SHIFT_BITS
	#undef kTopValue
	#undef kNumBitModelTotalBits
	#undef kBitModelTotal
	#undef kNumMoveBits