Merge pull request #7414 from OpenRCT2/refactor/string-conversions

Refactor string conversion between RCT2 and UTF8
2025-12-10 09:32:29 +01:00 · 2018-04-25 10:14:39 +01:00
parent 2e646c6733 6d76d76f98
commit 42f46d15e3
23 changed files with 512 additions and 251 deletions
--- a/.travis.yml
+++ b/.travis.yml
@@ -92,7 +92,7 @@ matrix:
            - docker
        - os: osx
          if: type != cron
-          osx_image: xcode8.3
+          osx_image: xcode9.3
          env:
            - secure: "OXn/i72FxW/oh6RGlaN+gHSbkt1ToFe36etaiDOsJQznt6fe9CpFdnE8U1XBHlGokcEjbGNErRU7CFDKYHQuGrPZyHXwgqG2/0emIqFaFt5ti5ypyYKf5qH9x1LLLfdZxDyHkxXdlJ7Etxbp3G7qrV8CGRQiYRNHm1f98AmuufE="
          after_success:
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,7 +127,7 @@ else ()
        set(PIE_FLAG "-fpie")
    endif ()
    
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++14")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17")
 endif ()

 # Defines
--- a/src/openrct2/Game.cpp
+++ b/src/openrct2/Game.cpp
@@ -1103,11 +1103,10 @@ static void load_landscape()

 void utf8_to_rct2_self(char * buffer, size_t length)
 {
-    char tempBuffer[512];
-    utf8_to_rct2(tempBuffer, buffer);
+    auto temp = utf8_to_rct2(buffer);

    size_t       i   = 0;
-    const char * src = tempBuffer;
+    const char * src = temp.data();
    char       * dst = buffer;
    while (*src != 0 && i < length - 1)
    {
@@ -1143,9 +1142,8 @@ void rct2_to_utf8_self(char * buffer, size_t length)
 {
    if (length > 0)
    {
-        char tempBuffer[512];
-        rct2_to_utf8(tempBuffer, buffer);
-        safe_strcpy(buffer, tempBuffer, length);
+        auto temp = rct2_to_utf8(buffer, RCT2_LANGUAGE_ID_ENGLISH_UK);
+        safe_strcpy(buffer, temp.data(), length);
    }
 }

--- a/src/openrct2/core/IStream.cpp
+++ b/src/openrct2/core/IStream.cpp
@@ -37,17 +37,13 @@ utf8 * IStream::ReadString()

 std::string IStream::ReadStdString()
 {
-    std::vector<utf8> result;
-
+    std::string result;
    uint8 ch;
    while ((ch = ReadValue<uint8>()) != 0)
    {
        result.push_back(ch);
    }
-    result.push_back(0);
-
-    std::string resultString(result.data(), result.data() + result.size());
-    return resultString;
+    return result;
 }

 void IStream::WriteString(const utf8 * str)
--- a/src/openrct2/core/String.cpp
+++ b/src/openrct2/core/String.cpp
@@ -18,6 +18,14 @@
 #include <stdexcept>
 #include <vector>

+#ifdef _WIN32
+#ifndef NOMINMAX
+#define NOMINMAX
+#endif
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
 #include "../localisation/Language.h"
 #include "../util/Util.h"

@@ -517,4 +525,32 @@ namespace String
        size_t stringLength = endSubstr - startSubstr + 1;
        return std::string(startSubstr, stringLength);
    }
+
+    std::string Convert(const std::string_view& src, sint32 srcCodePage, sint32 dstCodePage)
+    {
+#ifdef _WIN32
+        // Convert from source code page to UTF-16
+        std::wstring u16;
+        {
+            int srcLen = (int)src.size();
+            int sizeReq = MultiByteToWideChar(srcCodePage, 0, src.data(), srcLen, nullptr, 0);
+            u16 = std::wstring(sizeReq, 0);
+            MultiByteToWideChar(srcCodePage, 0, src.data(), srcLen, u16.data(), sizeReq);
+        }
+
+        // Convert from UTF-16 to destination code page
+        std::string dst;
+        {
+            int srcLen = (int)u16.size();
+            int sizeReq = WideCharToMultiByte(dstCodePage, 0, u16.data(), srcLen, nullptr, 0, nullptr, nullptr);
+            dst = std::string(sizeReq, 0);
+            WideCharToMultiByte(dstCodePage, 0, u16.data(), srcLen, dst.data(), sizeReq, nullptr, nullptr);
+        }
+
+        return dst;
+#else
+        STUB();
+        return std::string(src);
+#endif
+    }
 }
--- a/src/openrct2/core/String.hpp
+++ b/src/openrct2/core/String.hpp
@@ -22,6 +22,19 @@
 #include <vector>
 #include "../common.h"

+namespace CODE_PAGE
+{
+    // windows.h defines CP_UTF8
+#undef CP_UTF8
+
+    constexpr sint32 CP_932 = 932;      // ANSI/OEM Japanese; Japanese (Shift-JIS)
+    constexpr sint32 CP_936 = 936;      // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312)
+    constexpr sint32 CP_949 = 949;      // ANSI/OEM Korean (Unified Hangul Code)
+    constexpr sint32 CP_950 = 950;      // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5)
+    constexpr sint32 CP_1252 = 1252;    // ANSI Latin 1; Western European (Windows)
+    constexpr sint32 CP_UTF8 = 65001;   // Unicode (UTF-8)
+}
+
 namespace String
 {
    constexpr const utf8 * Empty = "";
@@ -91,4 +104,9 @@ namespace String
    utf8 *          TrimStart(utf8 * buffer, size_t bufferSize, const utf8 * src);
    std::string     TrimStart(const std::string &s);
    std::string     Trim(const std::string &s);
+
+    /**
+     * Converts a multi-byte string from one code page to another.
+     */
+    std::string Convert(const std::string_view& src, sint32 srcCodePage, sint32 dstCodePage);
 }
--- a/src/openrct2/drawing/Font.cpp
+++ b/src/openrct2/drawing/Font.cpp
@@ -15,7 +15,6 @@
 #pragma endregion

 #include "../core/Util.hpp"
-#include "../localisation/ConversionTables.h"
 #include "../localisation/FormatCodes.h"
 #include "../localisation/Language.h"
 #include "../sprites.h"
--- a/src/openrct2/localisation/ConversionTables.cpp
+++ b/src/openrct2/localisation/ConversionTables.cpp
@@ -14,8 +14,16 @@
 *****************************************************************************/
 #pragma endregion

-#include "ConversionTables.h"
+#include <cstdlib>
+#include "../core/Util.hpp"
 #include "FormatCodes.h"
+#include "Localisation.h"
+
+struct encoding_convert_entry
+{
+    uint16 code;
+    uint32 unicode;
+};

 // clang-format off
 const encoding_convert_entry RCT2ToUnicodeTable[256] =
@@ -277,7 +285,44 @@ const encoding_convert_entry RCT2ToUnicodeTable[256] =
    { RCT2_Z_ACUTE, UNICODE_Z_ACUTE },
    { 255, 255 }
 };
-    
+
+static sint32 encoding_search_compare(const void *pKey, const void *pEntry)
+{
+    uint16 key = *((uint16*)pKey);
+    encoding_convert_entry *entry = (encoding_convert_entry*)pEntry;
+    if (key < entry->code) return -1;
+    if (key > entry->code) return 1;
+    return 0;
+}
+
+static wchar_t encoding_convert_x_to_unicode(wchar_t code, const encoding_convert_entry *table, size_t count)
+{
+    encoding_convert_entry * entry = (encoding_convert_entry *)std::bsearch(&code, table, count, sizeof(encoding_convert_entry), encoding_search_compare);
+    if (entry == nullptr) return code;
+    else return entry->unicode;
+}
+
+wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str)
+{
+    return encoding_convert_x_to_unicode(rct2str, RCT2ToUnicodeTable, Util::CountOf(RCT2ToUnicodeTable));
+}
+
+uint32 encoding_convert_unicode_to_rct2(uint32 unicode)
+{
+    // Can't do a binary search as it's sorted by RCT2 code, not unicode
+    for (const auto& entry : RCT2ToUnicodeTable)
+    {
+        if (entry.unicode == unicode)
+        {
+            return entry.code;
+        }
+    }
+    return unicode;
+}
+
+ 
+#ifndef _WIN32
+
 const encoding_convert_entry GB2312ToUnicodeTable[7445] =
 {
    { 8481, 12288 },
@@ -46295,3 +46340,25 @@ const encoding_convert_entry CP949ToUnicodeTable[17176] =
    { 0xFDFE, 0x8A70 }, // CJK UNIFIED IDEOGRAPH
 };
 //clang-format on
+
+wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312)
+{
+    return encoding_convert_x_to_unicode(gb2312 - 0x8080, GB2312ToUnicodeTable, Util::CountOf(GB2312ToUnicodeTable));
+}
+
+wchar_t encoding_convert_big5_to_unicode(wchar_t big5)
+{
+    return encoding_convert_x_to_unicode(big5, Big5ToUnicodeTable, Util::CountOf(Big5ToUnicodeTable));
+}
+
+wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932)
+{
+    return encoding_convert_x_to_unicode(cp932, CP932ToUnicodeTable, Util::CountOf(CP932ToUnicodeTable));
+}
+
+wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949)
+{
+    return encoding_convert_x_to_unicode(cp949, CP949ToUnicodeTable, Util::CountOf(CP949ToUnicodeTable));
+}
+
+#endif
--- a/src/openrct2/localisation/ConversionTables.h
+++ b/src/openrct2/localisation/ConversionTables.h
@@ -1,73 +0,0 @@
-#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers
-/*****************************************************************************
- * OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
- *
- * OpenRCT2 is the work of many authors, a full list can be found in contributors.md
- * For more information, visit https://github.com/OpenRCT2/OpenRCT2
- *
- * OpenRCT2 is free software: you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation, either version 3 of the License, or
- * (at your option) any later version.
- *
- * A full copy of the GNU General Public License can be found in licence.txt
- *****************************************************************************/
-#pragma endregion
-
-#pragma once
-
-#include "../common.h"
-
-struct encoding_convert_entry
-{
-    uint16 code;
-    uint32 unicode;
-};
-
-extern const encoding_convert_entry GB2312ToUnicodeTable[7445];
-extern const encoding_convert_entry Big5ToUnicodeTable[13710];
-extern const encoding_convert_entry RCT2ToUnicodeTable[256];
-extern const encoding_convert_entry CP932ToUnicodeTable[7916];
-extern const encoding_convert_entry CP949ToUnicodeTable[17176];
-
-enum RCT2Polish
-{
-    RCT2_A_OGONEK_UC = 159, // 0x9F
-    RCT2_C_ACUTE_UC =  162, // 0xA2
-    RCT2_E_OGONEK_UC = 166, // 0xA6
-    RCT2_N_ACUTE_UC =  198, // 0xC6
-    RCT2_L_STROKE_UC = 167, // 0xA7
-    RCT2_S_ACUTE_UC =  208, // 0xD0
-    RCT2_Z_DOT_UC =    216, // 0xD8
-    RCT2_Z_ACUTE_UC =  215, // 0xD7
-
-    RCT2_A_OGONEK =    221, // 0xDD
-    RCT2_C_ACUTE =     222, // 0xDE
-    RCT2_E_OGONEK =    230, // 0xE6
-    RCT2_N_ACUTE =     240, // 0xF0
-    RCT2_L_STROKE =    247, // 0xF7
-    RCT2_S_ACUTE =     248, // 0xF8
-    RCT2_Z_DOT =       253, // 0xFD
-    RCT2_Z_ACUTE =     254, // 0xFE
-};
-
-enum UnicodePolish
-{
-    UNICODE_A_OGONEK_UC = 260,
-    UNICODE_C_ACUTE_UC = 262,
-    UNICODE_E_OGONEK_UC = 280,
-    UNICODE_N_ACUTE_UC = 323,
-    UNICODE_L_STROKE_UC = 321,
-    UNICODE_S_ACUTE_UC = 346,
-    UNICODE_Z_DOT_UC = 379,
-    UNICODE_Z_ACUTE_UC = 377,
-
-    UNICODE_A_OGONEK = 261,
-    UNICODE_C_ACUTE = 263,
-    UNICODE_E_OGONEK = 281,
-    UNICODE_N_ACUTE = 324,
-    UNICODE_L_STROKE = 322,
-    UNICODE_S_ACUTE = 347,
-    UNICODE_Z_DOT = 380,
-    UNICODE_Z_ACUTE = 378,
-};
--- a/src/openrct2/localisation/Convert.cpp
+++ b/src/openrct2/localisation/Convert.cpp
@@ -1,4 +1,4 @@
-#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers
+#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers
 /*****************************************************************************
 * OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
 *
@@ -14,101 +14,182 @@
 *****************************************************************************/
 #pragma endregion

+#include <algorithm>
+#include <limits>
+#include <stdexcept>
+#include "../core/String.hpp"
 #include "../core/Util.hpp"
-#include "ConversionTables.h"
 #include "Localisation.h"

-sint32 rct2_to_utf8(utf8 *dst, const char *src)
+/**
+ * Decodes an RCT2 string to a wide char string still in the original code page.
+ * An RCT2 string is a multi-byte string where every two-byte code point is preceded by a byte value of 255.
+ */
+static std::wstring DecodeToWideChar(const std::string_view& src)
 {
-    wchar_t codepoint;
+    std::wstring decoded;
+    decoded.reserve(src.size());
+    for (auto it = src.begin(); it != src.end(); )
+    {
+        uint8_t c = *it++;
+        if (c == 255)
+        {
+            // Push next two characters
+            uint8 a = 0;
+            uint8 b = 0;
+            if (it != src.end())
+            {
+                a = *it++;
+                if (it != src.end())
+                {
+                    b = *it++;
+                }
+                else
+                {
+                    // 2nd byte for double byte character is missing
+                    break;
+                }
+            }
+            else
+            {
+                // 1st byte for double byte character is missing
+                break;
+            }

-    utf8 *start = dst;
-    const char *ch = src;
-    while (*ch != 0) {
-        if (*ch == (char)(uint8)0xFF) {
-            ch++;
-
-            // Read wide char
-            uint8 a = *ch++;
-            uint8 b = *ch++;
-            codepoint = (a << 8) | b;
-        } else {
-            codepoint = (uint8)(*ch++);
-            codepoint = encoding_convert_rct2_to_unicode(codepoint);
+            wchar_t cp = (a << 8) | b;
+            decoded.push_back(cp);
+        }
+        else
+        {
+            // Push character
+            decoded.push_back(c);
        }
-
-        dst = utf8_write_codepoint(dst, codepoint);
    }
-    dst = utf8_write_codepoint(dst, 0);
-    return (sint32)(dst - start);
+    return decoded;
 }

-sint32 utf8_to_rct2(char *dst, const utf8 *src)
+static std::string DecodeToMultiByte(const std::string_view& src)
 {
-    char *start = dst;
-    const utf8 *ch = src;
+    auto wide = DecodeToWideChar(src);
+    std::string result;
+    result.reserve(wide.size());
+    for (auto cc : wide)
+    {
+        if (cc <= 255)
+        {
+            result.push_back(cc);
+        }
+        else
+        {
+            result.push_back((cc >> 8) & 0xFF);
+            result.push_back(cc & 0xFF);
+        }
+    }
+    return result;
+}
+
+/**
+ * Encodes a UTF-8 string as an RCT2 string.
+ */
+static std::string Encode(const std::string_view& src)
+{
+    std::string dst;
+    const utf8 * ch = src.data();
    sint32 codepoint;
-    while ((codepoint = utf8_get_next(ch, &ch)) != 0) {
+    while ((codepoint = utf8_get_next(ch, &ch)) != 0)
+    {
        codepoint = encoding_convert_unicode_to_rct2(codepoint);
-        if (codepoint < 256) {
-            *dst++ = (char)codepoint;
-        } else if (codepoint <= 0xFFFF) {
-            *dst++ = (char)(uint8)0xFF;
-            *dst++ = (codepoint >> 8) & 0xFF;
-            *dst++ = codepoint & 0xFF;
+        if (codepoint <= std::numeric_limits<uint8>::max())
+        {
+            dst.push_back(codepoint);
+        }
+        else if (codepoint <= std::numeric_limits<uint16>::max())
+        {
+            dst.push_back((char)(uint8)0xFF);
+            dst.push_back((codepoint >> 8) & 0xFF);
+            dst.push_back(codepoint & 0xFF);
+        }
+        else
+        {
+            // RCT2 strings do not support code points greater than 65535; replace them with '?'
+            dst.push_back('?');
        }
    }
-    *dst++ = 0;
-    return (sint32)(dst - start);
+    return dst;
 }

-static sint32 encoding_search_compare(const void *pKey, const void *pEntry)
+static sint32 GetCodePageForRCT2Language(RCT2LanguageId languageId)
 {
-    uint16 key = *((uint16*)pKey);
-    encoding_convert_entry *entry = (encoding_convert_entry*)pEntry;
-    if (key < entry->code) return -1;
-    if (key > entry->code) return 1;
-    return 0;
-}
-
-static wchar_t encoding_convert_x_to_unicode(wchar_t code, const encoding_convert_entry *table, size_t count)
-{
-    encoding_convert_entry * entry = (encoding_convert_entry *)bsearch(&code, table, count, sizeof(encoding_convert_entry), encoding_search_compare);
-    if (entry == nullptr) return code;
-    else return entry->unicode;
-}
-
-uint32 encoding_convert_unicode_to_rct2(uint32 unicode)
-{
-    // Can't do a binary search as it's sorted by RCT2 code, not unicode
-    for (uint32 i = 0; i < Util::CountOf(RCT2ToUnicodeTable); i++) {
-        if (RCT2ToUnicodeTable[i].unicode == unicode) return RCT2ToUnicodeTable[i].code;
+    switch (languageId)
+    {
+        case RCT2_LANGUAGE_ID_JAPANESE:
+            return CODE_PAGE::CP_932;
+        case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED:
+            return CODE_PAGE::CP_936;
+        case RCT2_LANGUAGE_ID_KOREAN:
+            return CODE_PAGE::CP_949;
+        case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL:
+            return CODE_PAGE::CP_950;
+        default:
+            return CODE_PAGE::CP_1252;
    }
-    return unicode;
 }

-wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str)
+template<typename TConvertFunc>
+static std::string DecodeConvertWithTable(const std::string_view& src, TConvertFunc func)
 {
-    return encoding_convert_x_to_unicode(rct2str, RCT2ToUnicodeTable, Util::CountOf(RCT2ToUnicodeTable));
+    auto decoded = DecodeToWideChar(src);
+    std::wstring u16;
+    u16.reserve(decoded.size());
+    for (auto cc : decoded)
+    {
+        u16.push_back(func(cc));
+    }
+    return String::ToUtf8(u16);
 }

-wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312)
+std::string rct2_to_utf8(const std::string_view& src, RCT2LanguageId languageId)
 {
-    return encoding_convert_x_to_unicode(gb2312 - 0x8080, GB2312ToUnicodeTable, Util::CountOf(GB2312ToUnicodeTable));
+    auto codePage = GetCodePageForRCT2Language(languageId);
+
+    std::string result;
+    switch (codePage)
+    {
+        case CODE_PAGE::CP_1252:
+            // The code page used by RCT2 was not quite 1252 as some codes were used for Polish characters.
+            result = DecodeConvertWithTable(src, encoding_convert_rct2_to_unicode);
+            break;
+
+#ifdef _WIN32
+        default:
+            auto decoded = DecodeToMultiByte(src);
+            result = String::Convert(decoded, codePage, CODE_PAGE::CP_UTF8);
+#else
+        // TODO Change this to use a library such as libicu
+        case CODE_PAGE::CP_932:
+            result = DecodeConvertWithTable(src, encoding_convert_cp932_to_unicode);
+            break;
+        case CODE_PAGE::CP_936:
+            result = DecodeConvertWithTable(src, encoding_convert_gb2312_to_unicode);
+            break;
+        case CODE_PAGE::CP_949:
+            result = DecodeConvertWithTable(src, encoding_convert_cp949_to_unicode);
+            break;
+        case CODE_PAGE::CP_950:
+            result = DecodeConvertWithTable(src, encoding_convert_big5_to_unicode);
+            break;
+        default:
+            throw std::runtime_error("Unsupported code page: " + std::to_string(codePage));
+            break;
+#endif
+    }
+    return result;
 }

-wchar_t encoding_convert_big5_to_unicode(wchar_t big5)
+std::string utf8_to_rct2(const std::string_view& src)
 {
-    return encoding_convert_x_to_unicode(big5, Big5ToUnicodeTable, Util::CountOf(Big5ToUnicodeTable));
+    // NOTE: This is only used for SC6 / SV6 files, which don't store the language identifier.
+    //       Because of this, we can only store in RCT2's CP_1252 format. We can preserve some
+    //       Unicode characters, but only those between 256 and 65535.
+    return Encode(src);
 }
-
-wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932)
-{
-    return encoding_convert_x_to_unicode(cp932, CP932ToUnicodeTable, Util::CountOf(CP932ToUnicodeTable));
-}
-
-wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949)
-{
-    return encoding_convert_x_to_unicode(cp949, CP949ToUnicodeTable, Util::CountOf(CP949ToUnicodeTable));
-}
-
--- a/src/openrct2/localisation/FormatCodes.h
+++ b/src/openrct2/localisation/FormatCodes.h
@@ -139,4 +139,46 @@ enum {
    FORMAT_COMMA1DP16 = 20004
 };

+enum RCT2Polish
+{
+    RCT2_A_OGONEK_UC = 159, // 0x9F
+    RCT2_C_ACUTE_UC = 162, // 0xA2
+    RCT2_E_OGONEK_UC = 166, // 0xA6
+    RCT2_N_ACUTE_UC = 198, // 0xC6
+    RCT2_L_STROKE_UC = 167, // 0xA7
+    RCT2_S_ACUTE_UC = 208, // 0xD0
+    RCT2_Z_DOT_UC = 216, // 0xD8
+    RCT2_Z_ACUTE_UC = 215, // 0xD7
+
+    RCT2_A_OGONEK = 221, // 0xDD
+    RCT2_C_ACUTE = 222, // 0xDE
+    RCT2_E_OGONEK = 230, // 0xE6
+    RCT2_N_ACUTE = 240, // 0xF0
+    RCT2_L_STROKE = 247, // 0xF7
+    RCT2_S_ACUTE = 248, // 0xF8
+    RCT2_Z_DOT = 253, // 0xFD
+    RCT2_Z_ACUTE = 254, // 0xFE
+};
+
+enum UnicodePolish
+{
+    UNICODE_A_OGONEK_UC = 260,
+    UNICODE_C_ACUTE_UC = 262,
+    UNICODE_E_OGONEK_UC = 280,
+    UNICODE_N_ACUTE_UC = 323,
+    UNICODE_L_STROKE_UC = 321,
+    UNICODE_S_ACUTE_UC = 346,
+    UNICODE_Z_DOT_UC = 379,
+    UNICODE_Z_ACUTE_UC = 377,
+
+    UNICODE_A_OGONEK = 261,
+    UNICODE_C_ACUTE = 263,
+    UNICODE_E_OGONEK = 281,
+    UNICODE_N_ACUTE = 324,
+    UNICODE_L_STROKE = 322,
+    UNICODE_S_ACUTE = 347,
+    UNICODE_Z_DOT = 380,
+    UNICODE_Z_ACUTE = 378,
+};
+
 #endif
--- a/src/openrct2/localisation/Language.cpp
+++ b/src/openrct2/localisation/Language.cpp
@@ -176,79 +176,6 @@ void language_close_all()
 constexpr rct_string_id NONSTEX_BASE_STRING_ID = 3463;
 constexpr uint16        MAX_OBJECT_CACHED_STRINGS = 2048;

-static wchar_t convert_specific_language_character_to_unicode(RCT2LanguageId languageId, wchar_t codepoint)
-{
-    switch (languageId) {
-    case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL:
-        return encoding_convert_big5_to_unicode(codepoint);
-    case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED:
-        return encoding_convert_gb2312_to_unicode(codepoint);
-    case RCT2_LANGUAGE_ID_JAPANESE:
-        return encoding_convert_cp932_to_unicode(codepoint);
-    case RCT2_LANGUAGE_ID_KOREAN:
-        return encoding_convert_cp949_to_unicode(codepoint);
-    default:
-        return codepoint;
-    }
-}
-
-static utf8 * convert_multibyte_charset(const char * src, size_t srcMaxSize, RCT2LanguageId languageId)
-{
-    constexpr char CODEPOINT_DOUBLEBYTE = (char)(uint8)0xFF;
-
-    auto sb = StringBuilder(64);
-    for (const char * ch = src; (ch < src + srcMaxSize) && (*ch != '\0');)
-    {
-        if (*ch == CODEPOINT_DOUBLEBYTE)
-        {
-            ch++;
-            if (ch < src + srcMaxSize)
-            {
-                uint8 a = *ch++;
-                if (a != '\0')
-                {
-                    uint8 b = *ch++;
-                    wchar_t codepoint16 = (wchar_t)((a << 8) | b);
-
-                    codepoint16 = convert_specific_language_character_to_unicode(languageId, codepoint16);
-                    sb.Append(codepoint16);
-                }
-            }
-        }
-        else
-        {
-            codepoint_t codepoint = (uint8)*ch++;
-            sb.Append(codepoint);
-        }
-    }
-    return sb.StealString();
-}
-
-static bool rct2_language_is_multibyte_charset(RCT2LanguageId languageId)
-{
-    switch (languageId) {
-    case RCT2_LANGUAGE_ID_KOREAN:
-    case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL:
-    case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED:
-    case RCT2_LANGUAGE_ID_JAPANESE:
-        return true;
-    default:
-        return false;
-    }
-}
-
-utf8 * rct2_language_string_to_utf8(const char *src, size_t srcSize, RCT2LanguageId languageId)
-{
-    if (rct2_language_is_multibyte_charset(languageId))
-    {
-        return convert_multibyte_charset(src, srcSize, languageId);
-    }
-    else
-    {
-        return win1252_to_utf8_alloc(src, srcSize);
-    }
-}
-
 bool language_get_localised_scenario_strings(const utf8 *scenarioFilename, rct_string_id *outStringIds)
 {
    outStringIds[0] = _languageCurrent->GetScenarioOverrideStringId(scenarioFilename, 0);
--- a/src/openrct2/localisation/Language.h
+++ b/src/openrct2/localisation/Language.h
@@ -18,6 +18,7 @@
 #define _LANGUAGE_H_

 #include <string>
+#include <string_view>
 #include "../common.h"
 #include "../drawing/Font.h"

@@ -109,7 +110,8 @@ sint32 utf8_length(const utf8 *text);
 wchar_t *utf8_to_widechar(const utf8 *src);
 utf8 *widechar_to_utf8(const wchar_t *src);

-utf8 *rct2_language_string_to_utf8(const char *src, size_t srcSize, RCT2LanguageId languageId);
+std::string rct2_to_utf8(const std::string_view& src, RCT2LanguageId languageId);
+std::string utf8_to_rct2(const std::string_view& src);
 bool language_get_localised_scenario_strings(const utf8 *scenarioFilename, rct_string_id *outStringIds);
 void language_free_object_string(rct_string_id stringId);
 rct_string_id language_get_object_override_string_id(const char * identifier, uint8 index);
--- a/src/openrct2/localisation/Localisation.h
+++ b/src/openrct2/localisation/Localisation.h
@@ -53,15 +53,18 @@ bool is_user_string_id(rct_string_id stringId);
 utf8 *win1252_to_utf8_alloc(const char *src, size_t srcMaxSize);
 sint32 win1252_to_utf8(utf8string dst, const char *src, size_t srcLength, size_t maxBufferLength);

-sint32 rct2_to_utf8(utf8 *dst, const char *src);
-sint32 utf8_to_rct2(char *dst, const utf8 *src);
 wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str);
 uint32 encoding_convert_unicode_to_rct2(uint32 unicode);
+
+#ifndef _WIN32
+
 wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312);
 wchar_t encoding_convert_big5_to_unicode(wchar_t big5);
 wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932);
 wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949);

+#endif
+
 #define MAX_USER_STRINGS 1024
 #define USER_STRING_MAX_LENGTH 32

--- a/src/openrct2/object/StringTable.cpp
+++ b/src/openrct2/object/StringTable.cpp
@@ -68,13 +68,13 @@ void StringTable::Read(IReadObjectContext * context, IStream * stream, uint8 id)
            entry.LanguageId = languageId;

            std::string stringAsWin1252 = stream->ReadStdString();
-            utf8 * stringAsUtf8 = rct2_language_string_to_utf8(stringAsWin1252.c_str(), stringAsWin1252.size(), rct2LanguageId);
+            auto stringAsUtf8 = rct2_to_utf8(stringAsWin1252, rct2LanguageId);

-            if (StringIsBlank(stringAsUtf8))
+            if (StringIsBlank(stringAsUtf8.data()))
            {
                entry.LanguageId = LANGUAGE_UNDEFINED;
            }
-            String::Trim(stringAsUtf8);
+            stringAsUtf8 = String::Trim(stringAsUtf8);

            entry.Text = stringAsUtf8;
            _strings.push_back(entry);
--- a/src/openrct2/rct1/S4Importer.cpp
+++ b/src/openrct2/rct1/S4Importer.cpp
@@ -271,10 +271,7 @@ public:
            dst->objective_arg_2 = _s4.scenario_objective_currency;
        dst->objective_arg_3 = _s4.scenario_objective_num_guests;

-        utf8 utf8name[256];
-        rct2_to_utf8(utf8name, _s4.scenario_name);
-
-        std::string name = std::string(utf8name, sizeof(utf8name));
+        auto name = rct2_to_utf8(_s4.scenario_name, RCT2_LANGUAGE_ID_ENGLISH_UK);
        std::string details;

        // TryGetById won't set this property if the scenario is not recognised,
@@ -2759,10 +2756,8 @@ private:

    std::string GetUserString(rct_string_id stringId)
    {
-        utf8 buffer[128] = { 0 };
        const char * originalString = _s4.string_table[(stringId - USER_STRING_START) % 1024];
-        rct2_to_utf8(buffer, originalString);
-        return std::string(buffer);
+        return rct2_to_utf8(originalString, RCT2_LANGUAGE_ID_ENGLISH_UK);
    }

    void FixLandOwnership()
--- a/src/openrct2/rct2/S6Exporter.cpp
+++ b/src/openrct2/rct2/S6Exporter.cpp
@@ -164,8 +164,14 @@ void S6Exporter::Export()
        log_error("Found %d disjoint null sprites", disjoint_sprites_count);
    }
    _s6.info = gS6Info;
-    utf8_to_rct2(_s6.info.name, gS6Info.name);
-    utf8_to_rct2(_s6.info.details, gS6Info.details);
+    {
+        auto temp = utf8_to_rct2(gS6Info.name);
+        safe_strcpy(_s6.info.name, temp.data(), sizeof(_s6.info.name));
+    }
+    {
+        auto temp = utf8_to_rct2(gS6Info.details);
+        safe_strcpy(_s6.info.details, temp.data(), sizeof(_s6.info.details));
+    }
    uint32 researchedTrackPiecesA[128];
    uint32 researchedTrackPiecesB[128];

--- a/src/openrct2/rct2/S6Importer.cpp
+++ b/src/openrct2/rct2/S6Importer.cpp
@@ -47,6 +47,7 @@
 #include "../scenario/Scenario.h"
 #include "../scenario/ScenarioRepository.h"
 #include "../util/SawyerCoding.h"
+#include "../util/Util.h"
 #include "../world/Climate.h"
 #include "../world/Entrance.h"
 #include "../world/MapAnimation.h"
@@ -200,8 +201,15 @@ public:

        // _s6.header
        gS6Info = _s6.info;
-        rct2_to_utf8(gS6Info.name, _s6.info.name);
-        rct2_to_utf8(gS6Info.details, _s6.info.details);
+
+        {
+            auto temp = rct2_to_utf8(_s6.info.name, RCT2_LANGUAGE_ID_ENGLISH_UK);
+            safe_strcpy(gS6Info.name, temp.data(), sizeof(gS6Info.name));
+        }
+        {
+            auto temp = rct2_to_utf8(_s6.info.details, RCT2_LANGUAGE_ID_ENGLISH_UK);
+            safe_strcpy(gS6Info.details, temp.data(), sizeof(gS6Info.details));
+        }

        gDateMonthsElapsed = _s6.elapsed_months;
        gDateMonthTicks    = _s6.current_day;
--- a/test/tests/CMakeLists.txt
+++ b/test/tests/CMakeLists.txt
@@ -76,6 +76,8 @@ set(COMMON_TEST_SOURCES
    "${ROOT_DIR}/src/openrct2/core/Guard.cpp"
    "${ROOT_DIR}/src/openrct2/core/String.cpp"
    "${ROOT_DIR}/src/openrct2/Diagnostic.cpp"
+    "${ROOT_DIR}/src/openrct2/localisation/ConversionTables.cpp"
+    "${ROOT_DIR}/src/openrct2/localisation/Convert.cpp"
    "${ROOT_DIR}/src/openrct2/localisation/FormatCodes.cpp"
    "${ROOT_DIR}/src/openrct2/localisation/UTF8.cpp"
    "${ROOT_DIR}/src/openrct2/util/Util.cpp"
@@ -139,6 +141,11 @@ add_executable(test_string ${STRING_TEST_SOURCES})
 target_link_libraries(test_string ${GTEST_LIBRARIES} test-common ${LDL} z)
 add_test(NAME string COMMAND test_string)

+# Localisation test (uses its own source-list variable rather than overwriting STRING_TEST_SOURCES)
+set(LOCALISATION_TEST_SOURCES "${CMAKE_CURRENT_LIST_DIR}/Localisation.cpp")
+add_executable(test_localisation ${LOCALISATION_TEST_SOURCES})
+target_link_libraries(test_localisation ${GTEST_LIBRARIES} test-common ${LDL} z)
+add_test(NAME localisation COMMAND test_localisation)

 # Ride ratings test
 set(RIDE_RATINGS_TEST_SOURCES "${CMAKE_CURRENT_LIST_DIR}/RideRatings.cpp"
--- a/test/tests/Localisation.cpp
+++ b/test/tests/Localisation.cpp
@@ -0,0 +1,80 @@
+#include "helpers/StringHelpers.hpp"
+#include "openrct2/localisation/Localisation.h"
+#include <gtest/gtest.h>
+
+class Localisation : public testing::Test
+{
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Tests for rct2_to_utf8
+///////////////////////////////////////////////////////////////////////////////
+
+TEST_F(Localisation, RCT2_to_UTF8_UK)
+{
+    auto input = "The quick brown fox";
+    auto expected = u8"The quick brown fox";
+    auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_ENGLISH_UK);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(Localisation, RCT2_to_UTF8_JP)
+{
+    auto input = StringFromHex("ff8374ff8340ff8358ff8367ff8375ff8389ff8345ff8393ff8374ff8348ff8362ff834eff8358");
+    auto expected = u8"ファストブラウンフォックス";
+    auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_JAPANESE);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(Localisation, RCT2_to_UTF8_ZH_TW)
+{
+    auto input = StringFromHex("ffa7d6ffb374ffaabaffb4c4ffa6e2ffaab0ffaf57");
+    auto expected = u8"快速的棕色狐狸";
+    auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(Localisation, RCT2_to_UTF8_PL)
+{
+    auto input = StringFromHex("47F372736b6120446ff76b692054e6637a6f7779");
+    auto expected = u8"Górska Dołki Tęczowy";
+    auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_ENGLISH_UK);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(Localisation, RCT2_to_UTF8_ZH_TW_PREMATURE_END)
+{
+    // This string can be found in BATFL.DAT; the last double-byte character is missing its second byte.
+    auto input = StringFromHex("ffa470ffabacffa8aeffbdf8ffa662ffc54bffb944ffa457ffaeb6ffb0caffb76effc2");
+    auto expected = u8"小型車輛在鐵道上振動搖";
+    auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL);
+    ASSERT_EQ(expected, actual);
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Tests for utf8_to_rct2
+///////////////////////////////////////////////////////////////////////////////
+
+TEST_F(Localisation, UTF8_to_RCT2_Basic)
+{
+    auto input = u8"à l'époque était";
+    auto expected = StringFromHex("e0206c27e9706f71756520e974616974");
+    auto actual = utf8_to_rct2(input);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(Localisation, UTF8_to_RCT2_ChineseTraditional)
+{
+    auto input = u8"$: 快速的棕色狐狸";
+    auto expected = StringFromHex("243a20ff5febff901fff7684ff68d5ff8272ff72d0ff72f8");
+    auto actual = utf8_to_rct2(input);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(Localisation, UTF8_to_RCT2_PL)
+{
+    auto input = u8"Górska Dołki Tęczowy";
+    auto expected = StringFromHex("47F372736b6120446ff76b692054e6637a6f7779");
+    auto actual = utf8_to_rct2(input);
+    ASSERT_EQ(expected, actual);
+}
--- a/test/tests/StringTest.cpp
+++ b/test/tests/StringTest.cpp
@@ -4,6 +4,7 @@
 #include <gtest/gtest.h>
 #include <openrct2/core/String.hpp>
 #include "AssertHelpers.hpp"
+#include "helpers/StringHelpers.hpp"

 using TCase = std::tuple<std::string, std::string, std::string>;

@@ -11,6 +12,10 @@ class StringTest : public testing::TestWithParam<TCase>
 {
 };

+///////////////////////////////////////////////////////////////////////////////
+// Tests for String::Trim
+///////////////////////////////////////////////////////////////////////////////
+
 INSTANTIATE_TEST_CASE_P(TrimData, StringTest, testing::Values(
    // input                      after Trim       after TrimStart
    TCase("string",               "string",        "string"),
@@ -45,6 +50,10 @@ TEST_P(StringTest, TrimStart)
    ASSERT_EQ(expected, actual);
 }

+///////////////////////////////////////////////////////////////////////////////
+// Tests for String::Split
+///////////////////////////////////////////////////////////////////////////////
+
 TEST_F(StringTest, Split_ByComma)
 {
    auto actual = String::Split("a,bb,ccc,dd", ",");
@@ -64,3 +73,44 @@ TEST_F(StringTest, Split_ByEmpty)
 {
    EXPECT_THROW(String::Split("string", ""), std::invalid_argument);
 }
+
+///////////////////////////////////////////////////////////////////////////////
+// Tests for String::Convert
+///////////////////////////////////////////////////////////////////////////////
+
+// TODO Remove this #ifdef guard once String::Convert is implemented for non-Windows platforms
+#ifdef _WIN32
+
+TEST_F(StringTest, Convert_950_to_UTF8)
+{
+    // Seven double-byte characters, converted from code page 950 to UTF-8.
+    const auto cp950Bytes = StringFromHex("a7d6b374aabab4c4a6e2aab0af57");
+    const auto utf8Text = String::Convert(cp950Bytes, CODE_PAGE::CP_950, CODE_PAGE::CP_UTF8);
+    ASSERT_EQ(u8"快速的棕色狐狸", utf8Text);
+}
+
+TEST_F(StringTest, Convert_UTF8_to_932)
+{
+    // UTF-8 to code page 932: each of the 13 input characters is expected as two bytes.
+    const auto expected = StringFromHex("83748340835883678375838983458393837483488362834e8358");
+    const auto actual = String::Convert(u8"ファストブラウンフォックス", CODE_PAGE::CP_UTF8, CODE_PAGE::CP_932);
+    ASSERT_EQ(expected, actual);
+}
+
+TEST_F(StringTest, Convert_UTF8_to_UTF8)
+{
+    // Source and destination code page are the same, so the text is expected back unchanged.
+    const auto text = u8"سريع|brown|ثعلب";
+    const auto converted = String::Convert(text, CODE_PAGE::CP_UTF8, CODE_PAGE::CP_UTF8);
+    ASSERT_EQ(text, converted);
+}
+
+TEST_F(StringTest, Convert_Empty)
+{
+    // An empty input is expected to convert to an empty result.
+    const auto empty = "";
+    const auto converted = String::Convert(empty, CODE_PAGE::CP_1252, CODE_PAGE::CP_UTF8);
+    ASSERT_EQ(empty, converted);
+}
+
+#endif
--- a/test/tests/helpers/StringHelpers.hpp
+++ b/test/tests/helpers/StringHelpers.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <cassert>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+
+/**
+ * Decodes a string of hexadecimal digit pairs (e.g. "47F3") into the raw bytes they spell out.
+ * Upper- and lower-case digits are both accepted.
+ * @param input Text made up of exactly two hex digits per output byte.
+ * @returns A string holding one byte for every pair of digits in input.
+ * @throws std::invalid_argument if input has an odd length or contains a non-hex character.
+ */
+inline std::string StringFromHex(std::string_view input)
+{
+    // Value of a single hex digit, or -1 when the character is not one.
+    const auto nibble = [](char c) -> int
+    {
+        if (c >= '0' && c <= '9') return c - '0';
+        if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+        if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+        return -1;
+    };
+
+    // Checked at run time rather than with assert(), which compiles out under NDEBUG.
+    if (input.size() % 2 != 0)
+    {
+        throw std::invalid_argument("StringFromHex: input length must be even");
+    }
+
+    std::string result;
+    result.reserve(input.size() / 2);
+    for (size_t i = 0; i < input.size(); i += 2)
+    {
+        const int high = nibble(input[i]);
+        const int low = nibble(input[i + 1]);
+        if (high < 0 || low < 0)
+        {
+            throw std::invalid_argument("StringFromHex: input contains a non-hexadecimal character");
+        }
+        result.push_back(static_cast<char>((high << 4) | low));
+    }
+    return result;
+}
--- a/test/tests/tests.vcxproj
+++ b/test/tests/tests.vcxproj
@@ -52,12 +52,14 @@
  <!-- Files -->
  <ItemGroup>
    <ClInclude Include="AssertHelpers.hpp" />
+    <ClInclude Include="helpers\StringHelpers.hpp" />
    <ClInclude Include="TestData.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="LanguagePackTest.cpp" />
    <ClCompile Include="IniReaderTest.cpp" />
    <ClCompile Include="IniWriterTest.cpp" />
+    <ClCompile Include="Localisation.cpp" />
    <ClCompile Include="MultiLaunch.cpp" />
    <ClCompile Include="RideRatings.cpp" />
    <ClCompile Include="sawyercoding_test.cpp" />