Refactor string conversion between RCT2 and UTF8

Use Win32 API for conversion on Windows for non-1252 code pages instead of built-in tables.
2026-01-22 22:34:33 +01:00 · 2018-04-14 18:06:49 +01:00
parent 2512e4959c
commit deaa60f8f1
17 changed files with 343 additions and 250 deletions
--- a/src/openrct2/localisation/Convert.cpp
+++ b/src/openrct2/localisation/Convert.cpp
@@ -1,4 +1,4 @@
-#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers
+#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers
 /*****************************************************************************
 * OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
 *
@@ -14,101 +14,172 @@
 *****************************************************************************/
 #pragma endregion

+#include <algorithm>
+#include <limits>
+#include <stdexcept>
+#include "../core/String.hpp"
 #include "../core/Util.hpp"
-#include "ConversionTables.h"
 #include "Localisation.h"

-sint32 rct2_to_utf8(utf8 *dst, const char *src)
+/**
+ * Decodes an RCT2 string to a wide char string still in the original code page.
+ * An RCT2 string is a multi-byte string where every two-byte code point is preceded by a byte value of 255.
+ */
+static std::wstring DecodeToWideChar(const std::string_view& src)
 {
-    wchar_t codepoint;
+    std::wstring decoded;
+    decoded.reserve(src.size());
+    for (auto it = src.begin(); it != src.end(); it++)
+    {
+        uint8_t c = *it;
+        if (c == 255)
+        {
+            // Push next two characters
+            uint8 a = 0;
+            uint8 b = 0;
+            if (++it != src.end())
+            {
+                a = *it;
+                if (++it != src.end())
+                {
+                    b = *it;
+                }
+            }

-    utf8 *start = dst;
-    const char *ch = src;
-    while (*ch != 0) {
-        if (*ch == (char)(uint8)0xFF) {
-            ch++;
-
-            // Read wide char
-            uint8 a = *ch++;
-            uint8 b = *ch++;
-            codepoint = (a << 8) | b;
-        } else {
-            codepoint = (uint8)(*ch++);
-            codepoint = encoding_convert_rct2_to_unicode(codepoint);
+            wchar_t cp = (a << 8) | b;
+            decoded.push_back(cp);
+        }
+        else
+        {
+            // Push character
+            decoded.push_back(c);
        }
-
-        dst = utf8_write_codepoint(dst, codepoint);
    }
-    dst = utf8_write_codepoint(dst, 0);
-    return (sint32)(dst - start);
+    return decoded;
 }

-sint32 utf8_to_rct2(char *dst, const utf8 *src)
+static std::string DecodeToMultiByte(const std::string_view& src)
 {
-    char *start = dst;
-    const utf8 *ch = src;
+    auto wide = DecodeToWideChar(src);
+    std::string result;
+    result.reserve(wide.size());
+    for (auto cc : wide)
+    {
+        if (cc <= 255)
+        {
+            result.push_back(cc);
+        }
+        else
+        {
+            result.push_back((cc >> 8) & 0xFF);
+            result.push_back(cc & 0xFF);
+        }
+    }
+    return result;
+}
+
+/**
+ * Encodes a UTF-8 string as an RCT2 string.
+ */
+static std::string Encode(const std::string_view& src)
+{
+    std::string dst;
+    const utf8 * ch = src.data();
    sint32 codepoint;
-    while ((codepoint = utf8_get_next(ch, &ch)) != 0) {
+    while ((codepoint = utf8_get_next(ch, &ch)) != 0)
+    {
        codepoint = encoding_convert_unicode_to_rct2(codepoint);
-        if (codepoint < 256) {
-            *dst++ = (char)codepoint;
-        } else if (codepoint <= 0xFFFF) {
-            *dst++ = (char)(uint8)0xFF;
-            *dst++ = (codepoint >> 8) & 0xFF;
-            *dst++ = codepoint & 0xFF;
+        if (codepoint <= std::numeric_limits<uint8>::max())
+        {
+            dst.push_back(codepoint);
+        }
+        else if (codepoint <= std::numeric_limits<uint16>::max())
+        {
+            dst.push_back((char)(uint8)0xFF);
+            dst.push_back((codepoint >> 8) & 0xFF);
+            dst.push_back(codepoint & 0xFF);
+        }
+        else
+        {
+            // RCT2 strings do not support code points greater than 65535, so replace them with '?'.
+            dst.push_back('?');
        }
    }
-    *dst++ = 0;
-    return (sint32)(dst - start);
+    return dst;
 }

-static sint32 encoding_search_compare(const void *pKey, const void *pEntry)
+static sint32 GetCodePageForRCT2Language(RCT2LanguageId languageId)
 {
-    uint16 key = *((uint16*)pKey);
-    encoding_convert_entry *entry = (encoding_convert_entry*)pEntry;
-    if (key < entry->code) return -1;
-    if (key > entry->code) return 1;
-    return 0;
-}
-
-static wchar_t encoding_convert_x_to_unicode(wchar_t code, const encoding_convert_entry *table, size_t count)
-{
-    encoding_convert_entry * entry = (encoding_convert_entry *)bsearch(&code, table, count, sizeof(encoding_convert_entry), encoding_search_compare);
-    if (entry == nullptr) return code;
-    else return entry->unicode;
-}
-
-uint32 encoding_convert_unicode_to_rct2(uint32 unicode)
-{
-    // Can't do a binary search as it's sorted by RCT2 code, not unicode
-    for (uint32 i = 0; i < Util::CountOf(RCT2ToUnicodeTable); i++) {
-        if (RCT2ToUnicodeTable[i].unicode == unicode) return RCT2ToUnicodeTable[i].code;
+    switch (languageId)
+    {
+        case RCT2_LANGUAGE_ID_JAPANESE:
+            return CODE_PAGE::CP_932;
+        case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED:
+            return CODE_PAGE::CP_936;
+        case RCT2_LANGUAGE_ID_KOREAN:
+            return CODE_PAGE::CP_949;
+        case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL:
+            return CODE_PAGE::CP_950;
+        default:
+            return CODE_PAGE::CP_1252;
    }
-    return unicode;
 }

-wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str)
+template<typename TConvertFunc>
+static std::string DecodeConvertWithTable(const std::string_view& src, TConvertFunc func)
 {
-    return encoding_convert_x_to_unicode(rct2str, RCT2ToUnicodeTable, Util::CountOf(RCT2ToUnicodeTable));
+    auto decoded = DecodeToWideChar(src);
+    std::wstring u16;
+    u16.reserve(decoded.size());
+    for (auto cc : decoded)
+    {
+        u16.push_back(func(cc));
+    }
+    return String::ToUtf8(u16);
 }

-wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312)
+std::string rct2_to_utf8(const std::string_view& src, RCT2LanguageId languageId)
 {
-    return encoding_convert_x_to_unicode(gb2312 - 0x8080, GB2312ToUnicodeTable, Util::CountOf(GB2312ToUnicodeTable));
+    auto codePage = GetCodePageForRCT2Language(languageId);
+
+    std::string result;
+    switch (codePage)
+    {
+        case CODE_PAGE::CP_1252:
+            // The code page used by RCT2 was not quite 1252 as some codes were used for Polish characters.
+            result = DecodeConvertWithTable(src, encoding_convert_rct2_to_unicode);
+            break;
+
+#ifdef _WIN32
+        default:
+            auto decoded = DecodeToMultiByte(src);
+            result = String::Convert(decoded, codePage, CODE_PAGE::CP_UTF8);
+#else
+        // TODO Change this to use a library such as libicu
+        case CODE_PAGE::CP_932:
+            result = DecodeConvertWithTable(src, encoding_convert_cp932_to_unicode);
+            break;
+        case CODE_PAGE::CP_936:
+            result = DecodeConvertWithTable(src, encoding_convert_gb2312_to_unicode);
+            break;
+        case CODE_PAGE::CP_949:
+            result = DecodeConvertWithTable(src, encoding_convert_cp949_to_unicode);
+            break;
+        case CODE_PAGE::CP_950:
+            result = DecodeConvertWithTable(src, encoding_convert_big5_to_unicode);
+            break;
+        default:
+            throw std::runtime_error("Unsupported code page: " + std::to_string(codePage));
+            break;
+#endif
+    }
+    return result;
 }

-wchar_t encoding_convert_big5_to_unicode(wchar_t big5)
+std::string utf8_to_rct2(const std::string_view& src)
 {
-    return encoding_convert_x_to_unicode(big5, Big5ToUnicodeTable, Util::CountOf(Big5ToUnicodeTable));
+    // NOTE: This is only used for SC6 / SV6 files, which don't store the language identifier.
+    //       Because of this, we can only store strings in RCT2's CP_1252 format. We can preserve
+    //       some Unicode characters, but only those between 256 and 65535.
+    return Encode(src);
 }
-
-wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932)
-{
-    return encoding_convert_x_to_unicode(cp932, CP932ToUnicodeTable, Util::CountOf(CP932ToUnicodeTable));
-}
-
-wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949)
-{
-    return encoding_convert_x_to_unicode(cp949, CP949ToUnicodeTable, Util::CountOf(CP949ToUnicodeTable));
-}
-