| // Common/UTFConvert.h |
| |
| #ifndef ZIP7_INC_COMMON_UTF_CONVERT_H |
| #define ZIP7_INC_COMMON_UTF_CONVERT_H |
| |
| #include "MyBuffer.h" |
| #include "MyString.h" |
| |
| struct CUtf8Check |
| { |
| // Byte MaxByte; // in original src stream |
| bool NonUtf; |
| bool ZeroChar; |
| bool SingleSurrogate; |
| bool Escape; |
| bool Truncated; |
| UInt32 MaxHighPoint; // only for points >= 0x80 |
| |
| CUtf8Check() { Clear(); } |
| |
| void Clear() |
| { |
| // MaxByte = 0; |
| NonUtf = false; |
| ZeroChar = false; |
| SingleSurrogate = false; |
| Escape = false; |
| Truncated = false; |
| MaxHighPoint = 0; |
| } |
| |
| void Update(const CUtf8Check &c) |
| { |
| if (c.NonUtf) NonUtf = true; |
| if (c.ZeroChar) ZeroChar = true; |
| if (c.SingleSurrogate) SingleSurrogate = true; |
| if (c.Escape) Escape = true; |
| if (c.Truncated) Truncated = true; |
| if (MaxHighPoint < c.MaxHighPoint) MaxHighPoint = c.MaxHighPoint; |
| } |
| |
| void PrintStatus(AString &s) const |
| { |
| s.Empty(); |
| |
| // s.Add_OptSpaced("MaxByte="); |
| // s.Add_UInt32(MaxByte); |
| |
| if (NonUtf) s.Add_OptSpaced("non-UTF8"); |
| if (ZeroChar) s.Add_OptSpaced("ZeroChar"); |
| if (SingleSurrogate) s.Add_OptSpaced("SingleSurrogate"); |
| if (Escape) s.Add_OptSpaced("Escape"); |
| if (Truncated) s.Add_OptSpaced("Truncated"); |
| |
| if (MaxHighPoint != 0) |
| { |
| s.Add_OptSpaced("MaxUnicode="); |
| s.Add_UInt32(MaxHighPoint); |
| } |
| } |
| |
| |
| bool IsOK(bool allowReduced = false) const |
| { |
| if (NonUtf || SingleSurrogate || ZeroChar) |
| return false; |
| if (MaxHighPoint >= 0x110000) |
| return false; |
| if (Truncated && !allowReduced) |
| return false; |
| return true; |
| } |
| |
| // it checks full buffer as specified in (size) and it doesn't stop on zero char |
| void Check_Buf(const char *src, size_t size) throw(); |
| |
| void Check_AString(const AString &s) throw() |
| { |
| Check_Buf(s.Ptr(), s.Len()); |
| } |
| }; |
| |
| /* |
| if (allowReduced == false) - all UTF-8 character sequences must be finished. |
| if (allowReduced == true) - it allows truncated last character-Utf8-sequence |
| */ |
| |
| bool Check_UTF8_Buf(const char *src, size_t size, bool allowReduced) throw(); |
| bool CheckUTF8_AString(const AString &s) throw(); |
| |
| #define Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR (1 << 0) |
| #define Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE (1 << 1) |
| #define Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT (1 << 2) |
| |
| /* |
| Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
| |
| if (flag is NOT set) |
| { |
| it processes SINGLE-SURROGATE-8 as valid Unicode point. |
| it converts SINGLE-SURROGATE-8 to SINGLE-SURROGATE-16 |
    Note: some sequences of two SINGLE-SURROGATE-8 points
| will generate correct SURROGATE-16-PAIR, and |
| that SURROGATE-16-PAIR later will be converted to correct |
| UTF8-SURROGATE-21 point. So we don't restore original |
| STR-8 sequence in that case. |
| } |
| |
| if (flag is set) |
| { |
| if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is defined) |
| it generates ESCAPE for SINGLE-SURROGATE-8, |
| if (Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE is not defined) |
| it generates U+fffd for SINGLE-SURROGATE-8, |
| } |
| |
| |
| Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
| |
| if (flag is NOT set) |
| it generates (U+fffd) code for non-UTF-8 (invalid) characters |
| |
| if (flag is set) |
| { |
| It generates (ESCAPE) codes for NON-UTF-8 (invalid) characters. |
| And later we can restore original UTF-8-RAW characters from (ESCAPE-16-21) codes. |
| } |
| |
| Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT |
| |
| if (flag is NOT set) |
| { |
    it processes ESCAPE-8 points as other Unicode points.
    In Linux: ESCAPE-16 will mean two different ESCAPE-8 sequences,
| so we need HIGH-ESCAPE-PLANE-21 to restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW |
| } |
| |
| if (flag is set) |
| { |
| it generates ESCAPE-16-21 for ESCAPE-8 points |
| so we can restore UTF-8-RAW -> UTF-16 -> UTF-8-RAW without HIGH-ESCAPE-PLANE-21. |
| } |
| |
| |
| Main USE CASES with UTF-8 <-> UTF-16 conversions: |
| |
| WIN32: UTF-16-RAW -> UTF-8 (Archive) -> UTF-16-RAW |
| { |
| set Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
| Do NOT set Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
| Do NOT set Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT |
| |
| So we restore original SINGLE-SURROGATE-16 from single SINGLE-SURROGATE-8. |
| } |
| |
| Linux: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW |
| { |
    we want to restore the original UTF-8-RAW sequence later from that ESCAPE-16.
| Set the flags: |
| Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
| Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
| Z7_UTF_FLAG_FROM_UTF8_BMP_ESCAPE_CONVERT |
| } |
| |
| MacOS: UTF-8-RAW -> UTF-16 (Intermediate / Archive) -> UTF-8-RAW |
| { |
| we want to restore correct UTF-8 without any BMP processing: |
| Set the flags: |
| Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR |
| Z7_UTF_FLAG_FROM_UTF8_USE_ESCAPE |
| } |
| |
| */ |
| |
| // zero char is not allowed in (src) buf |
| bool Convert_UTF8_Buf_To_Unicode(const char *src, size_t srcSize, UString &dest, unsigned flags = 0); |
| |
| bool ConvertUTF8ToUnicode_Flags(const AString &src, UString &dest, unsigned flags = 0); |
| bool ConvertUTF8ToUnicode(const AString &src, UString &dest); |
| |
| #define Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR (1 << 8) |
| #define Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE (1 << 9) |
| // #define Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE (1 << 10) |
| |
| /* |
| Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR |
| |
| if (flag is NOT set) |
| { |
| we extract SINGLE-SURROGATE as normal UTF-8 |
| |
    In Windows : for UTF-16-RAW <-> UTF-8 (archive) <-> UTF-16-RAW conversions.
| |
| In Linux : |
| use-case-1: UTF-8 -> UTF-16 -> UTF-8 doesn't generate UTF-16 SINGLE-SURROGATE, |
| if (Z7_UTF_FLAG_FROM_UTF8_SURROGATE_ERROR) is used. |
| use-case 2: UTF-16-7z (with SINGLE-SURROGATE from Windows) -> UTF-8 (Linux) |
| will generate SINGLE-SURROGATE-UTF-8 here. |
| } |
| |
| if (flag is set) |
| { |
| we generate UTF_REPLACEMENT_CHAR (0xfffd) for SINGLE_SURROGATE |
| it can be used for compatibility mode with WIN32 UTF function |
| or if we want UTF-8 stream without any errors |
| } |
| |
| |
| Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE |
| |
| if (flag is NOT set) it doesn't extract raw 8-bit symbol from Escape-Plane-16 |
| if (flag is set) it extracts raw 8-bit symbol from Escape-Plane-16 |
| |
| in Linux we need some way to extract NON-UTF8 RAW 8-bits from BMP (UTF-16 7z archive): |
| if (we use High-Escape-Plane), we can transfer BMP escapes to High-Escape-Plane. |
| if (we don't use High-Escape-Plane), we must use Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. |
| |
| |
| Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE |
| // that flag affects the code only if (wchar_t is 32-bit) |
| // that mode with high-escape can be disabled now in UTFConvert.cpp |
| if (flag is NOT set) |
| it doesn't extract raw 8-bit symbol from High-Escape-Plane |
| if (flag is set) |
| it extracts raw 8-bit symbol from High-Escape-Plane |
| |
| Main use cases: |
| |
| WIN32 : UTF-16-RAW -> UTF-8 (archive) -> UTF-16-RAW |
| { |
| Do NOT set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE. |
| Do NOT set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR. |
| So we restore original UTF-16-RAW. |
| } |
| |
  Linux : UTF-8 with Escapes -> UTF-16 (7z archive) -> UTF-8 with Escapes
| set Z7_UTF_FLAG_TO_UTF8_EXTRACT_BMP_ESCAPE to extract non-UTF from 7z archive |
| set Z7_UTF_FLAG_TO_UTF8_PARSE_HIGH_ESCAPE for intermediate UTF-16. |
      Note: high escape mode can be ignored now in UTFConvert.cpp
| |
| macOS: |
| the system doesn't support incorrect UTF-8 in file names. |
| set Z7_UTF_FLAG_TO_UTF8_SURROGATE_ERROR |
| */ |
| |
| extern unsigned g_Unicode_To_UTF8_Flags; |
| |
| void ConvertUnicodeToUTF8_Flags(const UString &src, AString &dest, unsigned flags = 0); |
| void ConvertUnicodeToUTF8(const UString &src, AString &dest); |
| |
| void Convert_Unicode_To_UTF8_Buf(const UString &src, CByteBuffer &dest); |
| |
| /* |
| #ifndef _WIN32 |
| void Convert_UTF16_To_UTF32(const UString &src, UString &dest); |
| void Convert_UTF32_To_UTF16(const UString &src, UString &dest); |
| bool UTF32_IsThere_BigPoint(const UString &src); |
| bool Unicode_IsThere_BmpEscape(const UString &src); |
| #endif |
| |
| bool Unicode_IsThere_Utf16SurrogateError(const UString &src); |
| */ |
| |
| #ifdef Z7_WCHART_IS_16BIT |
| #define Convert_UnicodeEsc16_To_UnicodeEscHigh(s) |
| #else |
| void Convert_UnicodeEsc16_To_UnicodeEscHigh(UString &s); |
| #endif |
| |
| /* |
| // #include "../../C/CpuArch.h" |
| |
| // ---------- Utf16 Little endian functions ---------- |
| |
| // We store 16-bit surrogates even in 32-bit WCHARs in Linux. |
| // So now we don't use the following code: |
| |
| #if WCHAR_MAX > 0xffff |
| |
| // void *p : pointer to src bytes stream |
| // size_t len : num Utf16 characters : it can include or not include NULL character |
| |
| inline size_t Utf16LE__Get_Num_WCHARs(const void *p, size_t len) |
| { |
| #if WCHAR_MAX > 0xffff |
| size_t num_wchars = 0; |
| for (size_t i = 0; i < len; i++) |
| { |
| wchar_t c = GetUi16(p); |
| p = (const void *)((const Byte *)p + 2); |
| if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) |
| { |
| wchar_t c2 = GetUi16(p); |
| if (c2 >= 0xdc00 && c2 < 0xe000) |
| { |
| c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); |
| p = (const void *)((const Byte *)p + 2); |
| i++; |
| } |
| } |
| num_wchars++; |
| } |
| return num_wchars; |
| #else |
| UNUSED_VAR(p) |
| return len; |
| #endif |
| } |
| |
| // #include <stdio.h> |
| |
| inline wchar_t *Utf16LE__To_WCHARs_Sep(const void *p, size_t len, wchar_t *dest) |
| { |
| for (size_t i = 0; i < len; i++) |
| { |
| wchar_t c = GetUi16(p); |
| p = (const void *)((const Byte *)p + 2); |
| |
| #if WCHAR_PATH_SEPARATOR != L'/' |
| if (c == L'/') |
| c = WCHAR_PATH_SEPARATOR; |
| #endif |
| |
| #if WCHAR_MAX > 0xffff |
| |
| if (c >= 0xd800 && c < 0xdc00 && i + 1 != len) |
| { |
| wchar_t c2 = GetUi16(p); |
| if (c2 >= 0xdc00 && c2 < 0xe000) |
| { |
| // printf("\nSurragate : %4x %4x -> ", (int)c, (int)c2); |
| c = 0x10000 + ((c & 0x3ff) << 10) + (c2 & 0x3ff); |
| p = (const void *)((const Byte *)p + 2); |
| i++; |
| // printf("%4x\n", (int)c); |
| } |
| } |
| |
| #endif |
| |
| *dest++ = c; |
| } |
| return dest; |
| } |
| |
| |
| inline size_t Get_Num_Utf16_chars_from_wchar_string(const wchar_t *p) |
| { |
| size_t num = 0; |
| for (;;) |
| { |
| wchar_t c = *p++; |
| if (c == 0) |
| return num; |
| num += ((c >= 0x10000 && c < 0x110000) ? 2 : 1); |
| } |
| return num; |
| } |
| |
| inline Byte *wchars_to_Utf16LE(const wchar_t *p, Byte *dest) |
| { |
| for (;;) |
| { |
| wchar_t c = *p++; |
| if (c == 0) |
| return dest; |
| if (c >= 0x10000 && c < 0x110000) |
| { |
| SetUi16(dest , (UInt16)(0xd800 + ((c >> 10) & 0x3FF))); |
| SetUi16(dest + 2, (UInt16)(0xdc00 + ( c & 0x3FF))); |
| dest += 4; |
| } |
| else |
| { |
| SetUi16(dest, c); |
| dest += 2; |
| } |
| } |
| } |
| |
| #endif |
| */ |
| |
| #endif |