| /* Bcj2Enc.c -- BCJ2 Encoder converter for x86 code (Branch CALL/JUMP variant2) |
| 2023-04-02 : Igor Pavlov : Public domain */ |
| |
| #include "Precomp.h" |
| |
| /* #define SHOW_STAT */ |
| #ifdef SHOW_STAT |
| #include <stdio.h> |
| #define PRF2(s) printf("%s ip=%8x tempPos=%d src= %8x\n", s, (unsigned)p->ip64, p->tempPos, (unsigned)(p->srcLim - p->src)); |
| #else |
| #define PRF2(s) |
| #endif |
| |
| #include "Bcj2.h" |
| #include "CpuArch.h" |
| |
| #define kTopValue ((UInt32)1 << 24) |
| #define kNumBitModelTotalBits 11 |
| #define kBitModelTotal (1 << kNumBitModelTotalBits) |
| #define kNumMoveBits 5 |
| |
| void Bcj2Enc_Init(CBcj2Enc *p) |
| { |
| unsigned i; |
| p->state = BCJ2_ENC_STATE_ORIG; |
| p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; |
| p->context = 0; |
| p->flushRem = 5; |
| p->isFlushState = 0; |
| p->cache = 0; |
| p->range = 0xffffffff; |
| p->low = 0; |
| p->cacheSize = 1; |
| p->ip64 = 0; |
| p->fileIp64 = 0; |
| p->fileSize64_minus1 = BCJ2_ENC_FileSizeField_UNLIMITED; |
| p->relatLimit = BCJ2_ENC_RELAT_LIMIT_DEFAULT; |
| // p->relatExcludeBits = 0; |
| p->tempPos = 0; |
| for (i = 0; i < sizeof(p->probs) / sizeof(p->probs[0]); i++) |
| p->probs[i] = kBitModelTotal >> 1; |
| } |
| |
| // Z7_NO_INLINE |
| Z7_FORCE_INLINE |
| static BoolInt Bcj2_RangeEnc_ShiftLow(CBcj2Enc *p) |
| { |
| const UInt32 low = (UInt32)p->low; |
| const unsigned high = (unsigned) |
| #if defined(Z7_MSC_VER_ORIGINAL) \ |
| && defined(MY_CPU_X86) \ |
| && defined(MY_CPU_LE) \ |
| && !defined(MY_CPU_64BIT) |
| // we try to rid of __aullshr() call in MSVS-x86 |
| (((const UInt32 *)&p->low)[1]); // [1] : for little-endian only |
| #else |
| (p->low >> 32); |
| #endif |
| if (low < (UInt32)0xff000000 || high != 0) |
| { |
| Byte *buf = p->bufs[BCJ2_STREAM_RC]; |
| do |
| { |
| if (buf == p->lims[BCJ2_STREAM_RC]) |
| { |
| p->state = BCJ2_STREAM_RC; |
| p->bufs[BCJ2_STREAM_RC] = buf; |
| return True; |
| } |
| *buf++ = (Byte)(p->cache + high); |
| p->cache = 0xff; |
| } |
| while (--p->cacheSize); |
| p->bufs[BCJ2_STREAM_RC] = buf; |
| p->cache = (Byte)(low >> 24); |
| } |
| p->cacheSize++; |
| p->low = low << 8; |
| return False; |
| } |
| |
| |
| /* |
| We can use 2 alternative versions of code: |
| 1) non-marker version: |
| Byte CBcj2Enc::context |
| Byte temp[8]; |
| Last byte of marker (e8/e9/[0f]8x) can be written to temp[] buffer. |
| Encoder writes last byte of marker (e8/e9/[0f]8x) to dest, only in conjunction |
| with writing branch symbol to range coder in same Bcj2Enc_Encode_2() call. |
| |
| 2) marker version: |
| UInt32 CBcj2Enc::context |
| Byte CBcj2Enc::temp[4]; |
| MARKER_FLAG in CBcj2Enc::context shows that CBcj2Enc::context contains finded marker. |
| it's allowed that |
| one call of Bcj2Enc_Encode_2() writes last byte of marker (e8/e9/[0f]8x) to dest, |
| and another call of Bcj2Enc_Encode_2() does offset conversion. |
| So different values of (fileIp) and (fileSize) are possible |
| in these different Bcj2Enc_Encode_2() calls. |
| |
| Also marker version requires additional if((v & MARKER_FLAG) == 0) check in main loop. |
| So we use non-marker version. |
| */ |
| |
| /* |
| Corner cases with overlap in multi-block. |
| before v23: there was one corner case, where converted instruction |
| could start in one sub-stream and finish in next sub-stream. |
| If multi-block (solid) encoding is used, |
| and BCJ2_ENC_FINISH_MODE_END_BLOCK is used for each sub-stream. |
| and (0f) is last byte of previous sub-stream |
| and (8x) is first byte of current sub-stream |
| then (0f 8x) pair is treated as marker by BCJ2 encoder and decoder. |
| BCJ2 encoder can converts 32-bit offset for that (0f 8x) cortage, |
| if that offset meets limit requirements. |
| If encoder allows 32-bit offset conversion for such overlap case, |
| then the data in 3 uncompressed BCJ2 streams for some sub-stream |
| can depend from data of previous sub-stream. |
| That corner case is not big problem, and it's rare case. |
| Since v23.00 we do additional check to prevent conversions in such overlap cases. |
| */ |
| |
| /* |
| Bcj2Enc_Encode_2() output variables at exit: |
| { |
| if (Bcj2Enc_Encode_2() exits with (p->state == BCJ2_ENC_STATE_ORIG)) |
| { |
| it means that encoder needs more input data. |
| if (p->srcLim == p->src) at exit, then |
| { |
| (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) |
| all input data were read and processed, and we are ready for |
| new input data. |
| } |
| else |
| { |
| (p->srcLim != p->src) |
| (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) |
| The encoder have found e8/e9/0f_8x marker, |
| and p->src points to last byte of that marker, |
| Bcj2Enc_Encode_2() needs more input data to get totally |
| 5 bytes (last byte of marker and 32-bit branch offset) |
| as continuous array starting from p->src. |
| (p->srcLim - p->src < 5) requirement is met after exit. |
| So non-processed resedue from p->src to p->srcLim is always less than 5 bytes. |
| } |
| } |
| } |
| */ |
| |
| Z7_NO_INLINE |
| static void Bcj2Enc_Encode_2(CBcj2Enc *p) |
| { |
| if (!p->isFlushState) |
| { |
| const Byte *src; |
| UInt32 v; |
| { |
| const unsigned state = p->state; |
| if (BCJ2_IS_32BIT_STREAM(state)) |
| { |
| Byte *cur = p->bufs[state]; |
| if (cur == p->lims[state]) |
| return; |
| SetBe32a(cur, p->tempTarget) |
| p->bufs[state] = cur + 4; |
| } |
| } |
| p->state = BCJ2_ENC_STATE_ORIG; // for main reason of exit |
| src = p->src; |
| v = p->context; |
| |
| // #define WRITE_CONTEXT p->context = v; // for marker version |
| #define WRITE_CONTEXT p->context = (Byte)v; |
| #define WRITE_CONTEXT_AND_SRC p->src = src; WRITE_CONTEXT |
| |
| for (;;) |
| { |
| // const Byte *src; |
| // UInt32 v; |
| CBcj2Enc_ip_unsigned ip; |
| if (p->range < kTopValue) |
| { |
| // to reduce register pressure and code size: we save and restore local variables. |
| WRITE_CONTEXT_AND_SRC |
| if (Bcj2_RangeEnc_ShiftLow(p)) |
| return; |
| p->range <<= 8; |
| src = p->src; |
| v = p->context; |
| } |
| // src = p->src; |
| // #define MARKER_FLAG ((UInt32)1 << 17) |
| // if ((v & MARKER_FLAG) == 0) // for marker version |
| { |
| const Byte *srcLim; |
| Byte *dest = p->bufs[BCJ2_STREAM_MAIN]; |
| { |
| const SizeT remSrc = (SizeT)(p->srcLim - src); |
| SizeT rem = (SizeT)(p->lims[BCJ2_STREAM_MAIN] - dest); |
| if (rem >= remSrc) |
| rem = remSrc; |
| srcLim = src + rem; |
| } |
| /* p->context contains context of previous byte: |
| bits [0 : 7] : src[-1], if (src) was changed in this call |
| bits [8 : 31] : are undefined for non-marker version |
| */ |
| // v = p->context; |
| #define NUM_SHIFT_BITS 24 |
| #define CONV_FLAG ((UInt32)1 << 16) |
| #define ONE_ITER { \ |
| b = src[0]; \ |
| *dest++ = (Byte)b; \ |
| v = (v << NUM_SHIFT_BITS) | b; \ |
| if (((b + (0x100 - 0xe8)) & 0xfe) == 0) break; \ |
| if (((v - (((UInt32)0x0f << (NUM_SHIFT_BITS)) + 0x80)) & \ |
| ((((UInt32)1 << (4 + NUM_SHIFT_BITS)) - 0x1) << 4)) == 0) break; \ |
| src++; if (src == srcLim) { break; } } |
| |
| if (src != srcLim) |
| for (;;) |
| { |
| /* clang can generate ineffective code with setne instead of two jcc instructions. |
| we can use 2 iterations and external (unsigned b) to avoid that ineffective code genaration. */ |
| unsigned b; |
| ONE_ITER |
| ONE_ITER |
| } |
| |
| ip = p->ip64 + (CBcj2Enc_ip_unsigned)(SizeT)(dest - p->bufs[BCJ2_STREAM_MAIN]); |
| p->bufs[BCJ2_STREAM_MAIN] = dest; |
| p->ip64 = ip; |
| |
| if (src == srcLim) |
| { |
| WRITE_CONTEXT_AND_SRC |
| if (src != p->srcLim) |
| { |
| p->state = BCJ2_STREAM_MAIN; |
| return; |
| } |
| /* (p->src == p->srcLim) |
| (p->state == BCJ2_ENC_STATE_ORIG) */ |
| if (p->finishMode != BCJ2_ENC_FINISH_MODE_END_STREAM) |
| return; |
| /* (p->finishMode == BCJ2_ENC_FINISH_MODE_END_STREAM */ |
| // (p->flushRem == 5); |
| p->isFlushState = 1; |
| break; |
| } |
| src++; |
| // p->src = src; |
| } |
| // ip = p->ip; // for marker version |
| /* marker was found */ |
| /* (v) contains marker that was found: |
| bits [NUM_SHIFT_BITS : NUM_SHIFT_BITS + 7] |
| : value of src[-2] : xx/xx/0f |
| bits [0 : 7] : value of src[-1] : e8/e9/8x |
| */ |
| { |
| { |
| #if NUM_SHIFT_BITS != 24 |
| v &= ~(UInt32)CONV_FLAG; |
| #endif |
| // UInt32 relat = 0; |
| if ((SizeT)(p->srcLim - src) >= 4) |
| { |
| /* |
| if (relat != 0 || (Byte)v != 0xe8) |
| BoolInt isBigOffset = True; |
| */ |
| const UInt32 relat = GetUi32(src); |
| /* |
| #define EXCLUDE_FLAG ((UInt32)1 << 4) |
| #define NEED_CONVERT(rel) ((((rel) + EXCLUDE_FLAG) & (0 - EXCLUDE_FLAG * 2)) != 0) |
| if (p->relatExcludeBits != 0) |
| { |
| const UInt32 flag = (UInt32)1 << (p->relatExcludeBits - 1); |
| isBigOffset = (((relat + flag) & (0 - flag * 2)) != 0); |
| } |
| // isBigOffset = False; // for debug |
| */ |
| ip -= p->fileIp64; |
| // Use the following if check, if (ip) is 64-bit: |
| if (ip > (((v + 0x20) >> 5) & 1)) // 23.00 : we eliminate milti-block overlap for (Of 80) and (e8/e9) |
| if ((CBcj2Enc_ip_unsigned)((CBcj2Enc_ip_signed)ip + 4 + (Int32)relat) <= p->fileSize64_minus1) |
| if (((UInt32)(relat + p->relatLimit) >> 1) < p->relatLimit) |
| v |= CONV_FLAG; |
| } |
| else if (p->finishMode == BCJ2_ENC_FINISH_MODE_CONTINUE) |
| { |
| // (p->srcLim - src < 4) |
| // /* |
| // for non-marker version |
| p->ip64--; // p->ip = ip - 1; |
| p->bufs[BCJ2_STREAM_MAIN]--; |
| src--; |
| v >>= NUM_SHIFT_BITS; |
| // (0 < p->srcLim - p->src <= 4) |
| // */ |
| // v |= MARKER_FLAG; // for marker version |
| /* (p->state == BCJ2_ENC_STATE_ORIG) */ |
| WRITE_CONTEXT_AND_SRC |
| return; |
| } |
| { |
| const unsigned c = ((v + 0x17) >> 6) & 1; |
| CBcj2Prob *prob = p->probs + (unsigned) |
| (((0 - c) & (Byte)(v >> NUM_SHIFT_BITS)) + c + ((v >> 5) & 1)); |
| /* |
| ((Byte)v == 0xe8 ? 2 + ((Byte)(v >> 8)) : |
| ((Byte)v < 0xe8 ? 0 : 1)); // ((v >> 5) & 1)); |
| */ |
| const unsigned ttt = *prob; |
| const UInt32 bound = (p->range >> kNumBitModelTotalBits) * ttt; |
| if ((v & CONV_FLAG) == 0) |
| { |
| // static int yyy = 0; yyy++; printf("\n!needConvert = %d\n", yyy); |
| // v = (Byte)v; // for marker version |
| p->range = bound; |
| *prob = (CBcj2Prob)(ttt + ((kBitModelTotal - ttt) >> kNumMoveBits)); |
| // WRITE_CONTEXT_AND_SRC |
| continue; |
| } |
| p->low += bound; |
| p->range -= bound; |
| *prob = (CBcj2Prob)(ttt - (ttt >> kNumMoveBits)); |
| } |
| // p->context = src[3]; |
| { |
| // const unsigned cj = ((Byte)v == 0xe8 ? BCJ2_STREAM_CALL : BCJ2_STREAM_JUMP); |
| const unsigned cj = (((v + 0x57) >> 6) & 1) + BCJ2_STREAM_CALL; |
| ip = p->ip64; |
| v = GetUi32(src); // relat |
| ip += 4; |
| p->ip64 = ip; |
| src += 4; |
| // p->src = src; |
| { |
| const UInt32 absol = (UInt32)ip + v; |
| Byte *cur = p->bufs[cj]; |
| v >>= 24; |
| // WRITE_CONTEXT |
| if (cur == p->lims[cj]) |
| { |
| p->state = cj; |
| p->tempTarget = absol; |
| WRITE_CONTEXT_AND_SRC |
| return; |
| } |
| SetBe32a(cur, absol) |
| p->bufs[cj] = cur + 4; |
| } |
| } |
| } |
| } |
| } // end of loop |
| } |
| |
| for (; p->flushRem != 0; p->flushRem--) |
| if (Bcj2_RangeEnc_ShiftLow(p)) |
| return; |
| p->state = BCJ2_ENC_STATE_FINISHED; |
| } |
| |
| |
| /* |
| BCJ2 encoder needs look ahead for up to 4 bytes in (src) buffer. |
| So base function Bcj2Enc_Encode_2() |
| in BCJ2_ENC_FINISH_MODE_CONTINUE mode can return with |
| (p->state == BCJ2_ENC_STATE_ORIG && p->src < p->srcLim) |
| Bcj2Enc_Encode() solves that look ahead problem by using p->temp[] buffer. |
| so if (p->state == BCJ2_ENC_STATE_ORIG) after Bcj2Enc_Encode(), |
| then (p->src == p->srcLim). |
| And the caller's code is simpler with Bcj2Enc_Encode(). |
| */ |
| |
| Z7_NO_INLINE |
| void Bcj2Enc_Encode(CBcj2Enc *p) |
| { |
| PRF2("\n----") |
| if (p->tempPos != 0) |
| { |
| /* extra: number of bytes that were copied from (src) to (temp) buffer in this call */ |
| unsigned extra = 0; |
| /* We will touch only minimal required number of bytes in input (src) stream. |
| So we will add input bytes from (src) stream to temp[] with step of 1 byte. |
| We don't add new bytes to temp[] before Bcj2Enc_Encode_2() call |
| in first loop iteration because |
| - previous call of Bcj2Enc_Encode() could use another (finishMode), |
| - previous call could finish with (p->state != BCJ2_ENC_STATE_ORIG). |
| the case with full temp[] buffer (p->tempPos == 4) is possible here. |
| */ |
| for (;;) |
| { |
| // (0 < p->tempPos <= 5) // in non-marker version |
| /* p->src : the current src data position including extra bytes |
| that were copied to temp[] buffer in this call */ |
| const Byte *src = p->src; |
| const Byte *srcLim = p->srcLim; |
| const EBcj2Enc_FinishMode finishMode = p->finishMode; |
| if (src != srcLim) |
| { |
| /* if there are some src data after the data copied to temp[], |
| then we use MODE_CONTINUE for temp data */ |
| p->finishMode = BCJ2_ENC_FINISH_MODE_CONTINUE; |
| } |
| p->src = p->temp; |
| p->srcLim = p->temp + p->tempPos; |
| PRF2(" ") |
| Bcj2Enc_Encode_2(p); |
| { |
| const unsigned num = (unsigned)(p->src - p->temp); |
| const unsigned tempPos = p->tempPos - num; |
| unsigned i; |
| p->tempPos = tempPos; |
| for (i = 0; i < tempPos; i++) |
| p->temp[i] = p->temp[(SizeT)i + num]; |
| // tempPos : number of bytes in temp buffer |
| p->src = src; |
| p->srcLim = srcLim; |
| p->finishMode = finishMode; |
| if (p->state != BCJ2_ENC_STATE_ORIG) |
| { |
| // (p->tempPos <= 4) // in non-marker version |
| /* if (the reason of exit from Bcj2Enc_Encode_2() |
| is not BCJ2_ENC_STATE_ORIG), |
| then we exit from Bcj2Enc_Encode() with same reason */ |
| // optional code begin : we rollback (src) and tempPos, if it's possible: |
| if (extra >= tempPos) |
| extra = tempPos; |
| p->src = src - extra; |
| p->tempPos = tempPos - extra; |
| // optional code end : rollback of (src) and tempPos |
| return; |
| } |
| /* (p->tempPos <= 4) |
| (p->state == BCJ2_ENC_STATE_ORIG) |
| so encoder needs more data than in temp[] */ |
| if (src == srcLim) |
| return; // src buffer has no more input data. |
| /* (src != srcLim) |
| so we can provide more input data from src for Bcj2Enc_Encode_2() */ |
| if (extra >= tempPos) |
| { |
| /* (extra >= tempPos) means that temp buffer contains |
| only data from src buffer of this call. |
| So now we can encode without temp buffer */ |
| p->src = src - tempPos; // rollback (src) |
| p->tempPos = 0; |
| break; |
| } |
| // we append one additional extra byte from (src) to temp[] buffer: |
| p->temp[tempPos] = *src; |
| p->tempPos = tempPos + 1; |
| // (0 < p->tempPos <= 5) // in non-marker version |
| p->src = src + 1; |
| extra++; |
| } |
| } |
| } |
| |
| PRF2("++++") |
| // (p->tempPos == 0) |
| Bcj2Enc_Encode_2(p); |
| PRF2("====") |
| |
| if (p->state == BCJ2_ENC_STATE_ORIG) |
| { |
| const Byte *src = p->src; |
| const Byte *srcLim = p->srcLim; |
| const unsigned rem = (unsigned)(srcLim - src); |
| /* (rem <= 4) here. |
| if (p->src != p->srcLim), then |
| - we copy non-processed bytes from (p->src) to temp[] buffer, |
| - we set p->src equal to p->srcLim. |
| */ |
| if (rem) |
| { |
| unsigned i = 0; |
| p->src = srcLim; |
| p->tempPos = rem; |
| // (0 < p->tempPos <= 4) |
| do |
| p->temp[i] = src[i]; |
| while (++i != rem); |
| } |
| // (p->tempPos <= 4) |
| // (p->src == p->srcLim) |
| } |
| } |
| |
| #undef PRF2 |
| #undef CONV_FLAG |
| #undef MARKER_FLAG |
| #undef WRITE_CONTEXT |
| #undef WRITE_CONTEXT_AND_SRC |
| #undef ONE_ITER |
| #undef NUM_SHIFT_BITS |
| #undef kTopValue |
| #undef kNumBitModelTotalBits |
| #undef kBitModelTotal |
| #undef kNumMoveBits |