diff --git a/.travis.yml b/.travis.yml index f901d56902..0ed118e094 100644 --- a/.travis.yml +++ b/.travis.yml @@ -92,7 +92,7 @@ matrix: - docker - os: osx if: type != cron - osx_image: xcode8.3 + osx_image: xcode9.3 env: - secure: "OXn/i72FxW/oh6RGlaN+gHSbkt1ToFe36etaiDOsJQznt6fe9CpFdnE8U1XBHlGokcEjbGNErRU7CFDKYHQuGrPZyHXwgqG2/0emIqFaFt5ti5ypyYKf5qH9x1LLLfdZxDyHkxXdlJ7Etxbp3G7qrV8CGRQiYRNHm1f98AmuufE=" after_success: diff --git a/CMakeLists.txt b/CMakeLists.txt index 424d32b5b1..9329b27f3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,7 +127,7 @@ else () set(PIE_FLAG "-fpie") endif () - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++14") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=gnu++17") endif () # Defines diff --git a/src/openrct2/Game.cpp b/src/openrct2/Game.cpp index 5209b4f8c2..b598f69958 100644 --- a/src/openrct2/Game.cpp +++ b/src/openrct2/Game.cpp @@ -1103,11 +1103,10 @@ static void load_landscape() void utf8_to_rct2_self(char * buffer, size_t length) { - char tempBuffer[512]; - utf8_to_rct2(tempBuffer, buffer); + auto temp = utf8_to_rct2(buffer); size_t i = 0; - const char * src = tempBuffer; + const char * src = temp.data(); char * dst = buffer; while (*src != 0 && i < length - 1) { @@ -1143,9 +1142,8 @@ void rct2_to_utf8_self(char * buffer, size_t length) { if (length > 0) { - char tempBuffer[512]; - rct2_to_utf8(tempBuffer, buffer); - safe_strcpy(buffer, tempBuffer, length); + auto temp = rct2_to_utf8(buffer, RCT2_LANGUAGE_ID_ENGLISH_UK); + safe_strcpy(buffer, temp.data(), length); } } diff --git a/src/openrct2/core/IStream.cpp b/src/openrct2/core/IStream.cpp index 27d298baca..cb9b8d1324 100644 --- a/src/openrct2/core/IStream.cpp +++ b/src/openrct2/core/IStream.cpp @@ -37,17 +37,13 @@ utf8 * IStream::ReadString() std::string IStream::ReadStdString() { - std::vector result; - + std::string result; uint8 ch; while ((ch = ReadValue()) != 0) { result.push_back(ch); } - result.push_back(0); - - std::string resultString(result.data(), 
result.data() + result.size()); - return resultString; + return result; } void IStream::WriteString(const utf8 * str) diff --git a/src/openrct2/core/String.cpp b/src/openrct2/core/String.cpp index d9742c4436..64464eefc3 100644 --- a/src/openrct2/core/String.cpp +++ b/src/openrct2/core/String.cpp @@ -18,6 +18,14 @@ #include #include +#ifdef _WIN32 +#ifndef NOMINMAX +#define NOMINMAX +#endif +#define WIN32_LEAN_AND_MEAN +#include +#endif + #include "../localisation/Language.h" #include "../util/Util.h" @@ -517,4 +525,32 @@ namespace String size_t stringLength = endSubstr - startSubstr + 1; return std::string(startSubstr, stringLength); } + + std::string Convert(const std::string_view& src, sint32 srcCodePage, sint32 dstCodePage) + { +#ifdef _WIN32 + // Convert from source code page to UTF-16 + std::wstring u16; + { + int srcLen = (int)src.size(); + int sizeReq = MultiByteToWideChar(srcCodePage, 0, src.data(), srcLen, nullptr, 0); + u16 = std::wstring(sizeReq, 0); + MultiByteToWideChar(srcCodePage, 0, src.data(), srcLen, u16.data(), sizeReq); + } + + // Convert from UTF-16 to destination code page + std::string dst; + { + int srcLen = (int)u16.size(); + int sizeReq = WideCharToMultiByte(dstCodePage, 0, u16.data(), srcLen, nullptr, 0, nullptr, nullptr); + dst = std::string(sizeReq, 0); + WideCharToMultiByte(dstCodePage, 0, u16.data(), srcLen, dst.data(), sizeReq, nullptr, nullptr); + } + + return dst; +#else + STUB(); + return std::string(src); +#endif + } } diff --git a/src/openrct2/core/String.hpp b/src/openrct2/core/String.hpp index 3a5ed93836..cbb89d4d3d 100644 --- a/src/openrct2/core/String.hpp +++ b/src/openrct2/core/String.hpp @@ -22,6 +22,19 @@ #include #include "../common.h" +namespace CODE_PAGE +{ + // windows.h defines CP_UTF8 +#undef CP_UTF8 + + constexpr sint32 CP_932 = 932; // ANSI/OEM Japanese; Japanese (Shift-JIS) + constexpr sint32 CP_936 = 936; // ANSI/OEM Simplified Chinese (PRC, Singapore); Chinese Simplified (GB2312) + constexpr sint32 CP_949 = 
949; // ANSI/OEM Korean (Unified Hangul Code) + constexpr sint32 CP_950 = 950; // ANSI/OEM Traditional Chinese (Taiwan; Hong Kong SAR, PRC); Chinese Traditional (Big5) + constexpr sint32 CP_1252 = 1252; // ANSI Latin 1; Western European (Windows) + constexpr sint32 CP_UTF8 = 65001; // Unicode (UTF-8) +} + namespace String { constexpr const utf8 * Empty = ""; @@ -91,4 +104,9 @@ namespace String utf8 * TrimStart(utf8 * buffer, size_t bufferSize, const utf8 * src); std::string TrimStart(const std::string &s); std::string Trim(const std::string &s); + + /** + * Converts a multi-byte string from one code page to another. + */ + std::string Convert(const std::string_view& src, sint32 srcCodePage, sint32 dstCodePage); } diff --git a/src/openrct2/drawing/Font.cpp b/src/openrct2/drawing/Font.cpp index 45d2507275..95bb57453e 100644 --- a/src/openrct2/drawing/Font.cpp +++ b/src/openrct2/drawing/Font.cpp @@ -15,7 +15,6 @@ #pragma endregion #include "../core/Util.hpp" -#include "../localisation/ConversionTables.h" #include "../localisation/FormatCodes.h" #include "../localisation/Language.h" #include "../sprites.h" diff --git a/src/openrct2/localisation/ConversionTables.cpp b/src/openrct2/localisation/ConversionTables.cpp index bae693878e..ef4e4f7687 100644 --- a/src/openrct2/localisation/ConversionTables.cpp +++ b/src/openrct2/localisation/ConversionTables.cpp @@ -14,8 +14,16 @@ *****************************************************************************/ #pragma endregion -#include "ConversionTables.h" +#include +#include "../core/Util.hpp" #include "FormatCodes.h" +#include "Localisation.h" + +struct encoding_convert_entry +{ + uint16 code; + uint32 unicode; +}; // clang-format off const encoding_convert_entry RCT2ToUnicodeTable[256] = @@ -277,7 +285,44 @@ const encoding_convert_entry RCT2ToUnicodeTable[256] = { RCT2_Z_ACUTE, UNICODE_Z_ACUTE }, { 255, 255 } }; - + +static sint32 encoding_search_compare(const void *pKey, const void *pEntry) +{ + uint16 key = 
*((uint16*)pKey); + encoding_convert_entry *entry = (encoding_convert_entry*)pEntry; + if (key < entry->code) return -1; + if (key > entry->code) return 1; + return 0; +} + +static wchar_t encoding_convert_x_to_unicode(wchar_t code, const encoding_convert_entry *table, size_t count) +{ + encoding_convert_entry * entry = (encoding_convert_entry *)std::bsearch(&code, table, count, sizeof(encoding_convert_entry), encoding_search_compare); + if (entry == nullptr) return code; + else return entry->unicode; +} + +wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str) +{ + return encoding_convert_x_to_unicode(rct2str, RCT2ToUnicodeTable, Util::CountOf(RCT2ToUnicodeTable)); +} + +uint32 encoding_convert_unicode_to_rct2(uint32 unicode) +{ + // Can't do a binary search as it's sorted by RCT2 code, not unicode + for (const auto& entry : RCT2ToUnicodeTable) + { + if (entry.unicode == unicode) + { + return entry.code; + } + } + return unicode; +} + + +#ifndef _WIN32 + const encoding_convert_entry GB2312ToUnicodeTable[7445] = { { 8481, 12288 }, @@ -46295,3 +46340,25 @@ const encoding_convert_entry CP949ToUnicodeTable[17176] = { 0xFDFE, 0x8A70 }, // CJK UNIFIED IDEOGRAPH }; //clang-format on + +wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312) +{ + return encoding_convert_x_to_unicode(gb2312 - 0x8080, GB2312ToUnicodeTable, Util::CountOf(GB2312ToUnicodeTable)); +} + +wchar_t encoding_convert_big5_to_unicode(wchar_t big5) +{ + return encoding_convert_x_to_unicode(big5, Big5ToUnicodeTable, Util::CountOf(Big5ToUnicodeTable)); +} + +wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932) +{ + return encoding_convert_x_to_unicode(cp932, CP932ToUnicodeTable, Util::CountOf(CP932ToUnicodeTable)); +} + +wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949) +{ + return encoding_convert_x_to_unicode(cp949, CP949ToUnicodeTable, Util::CountOf(CP949ToUnicodeTable)); +} + +#endif diff --git a/src/openrct2/localisation/ConversionTables.h 
b/src/openrct2/localisation/ConversionTables.h deleted file mode 100644 index 711edeffbd..0000000000 --- a/src/openrct2/localisation/ConversionTables.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers -/***************************************************************************** - * OpenRCT2, an open source clone of Roller Coaster Tycoon 2. - * - * OpenRCT2 is the work of many authors, a full list can be found in contributors.md - * For more information, visit https://github.com/OpenRCT2/OpenRCT2 - * - * OpenRCT2 is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * A full copy of the GNU General Public License can be found in licence.txt - *****************************************************************************/ -#pragma endregion - -#pragma once - -#include "../common.h" - -struct encoding_convert_entry -{ - uint16 code; - uint32 unicode; -}; - -extern const encoding_convert_entry GB2312ToUnicodeTable[7445]; -extern const encoding_convert_entry Big5ToUnicodeTable[13710]; -extern const encoding_convert_entry RCT2ToUnicodeTable[256]; -extern const encoding_convert_entry CP932ToUnicodeTable[7916]; -extern const encoding_convert_entry CP949ToUnicodeTable[17176]; - -enum RCT2Polish -{ - RCT2_A_OGONEK_UC = 159, // 0x9F - RCT2_C_ACUTE_UC = 162, // 0xA2 - RCT2_E_OGONEK_UC = 166, // 0xA6 - RCT2_N_ACUTE_UC = 198, // 0xC6 - RCT2_L_STROKE_UC = 167, // 0xA7 - RCT2_S_ACUTE_UC = 208, // 0xD0 - RCT2_Z_DOT_UC = 216, // 0xD8 - RCT2_Z_ACUTE_UC = 215, // 0xD7 - - RCT2_A_OGONEK = 221, // 0xDD - RCT2_C_ACUTE = 222, // 0xDE - RCT2_E_OGONEK = 230, // 0xE6 - RCT2_N_ACUTE = 240, // 0xF0 - RCT2_L_STROKE = 247, // 0xF7 - RCT2_S_ACUTE = 248, // 0xF8 - RCT2_Z_DOT = 253, // 0xFD - RCT2_Z_ACUTE = 254, // 0xFE -}; - -enum UnicodePolish -{ - 
UNICODE_A_OGONEK_UC = 260, - UNICODE_C_ACUTE_UC = 262, - UNICODE_E_OGONEK_UC = 280, - UNICODE_N_ACUTE_UC = 323, - UNICODE_L_STROKE_UC = 321, - UNICODE_S_ACUTE_UC = 346, - UNICODE_Z_DOT_UC = 379, - UNICODE_Z_ACUTE_UC = 377, - - UNICODE_A_OGONEK = 261, - UNICODE_C_ACUTE = 263, - UNICODE_E_OGONEK = 281, - UNICODE_N_ACUTE = 324, - UNICODE_L_STROKE = 322, - UNICODE_S_ACUTE = 347, - UNICODE_Z_DOT = 380, - UNICODE_Z_ACUTE = 378, -}; diff --git a/src/openrct2/localisation/Convert.cpp b/src/openrct2/localisation/Convert.cpp index 786736c483..b6d789908c 100644 --- a/src/openrct2/localisation/Convert.cpp +++ b/src/openrct2/localisation/Convert.cpp @@ -1,4 +1,4 @@ -#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers +#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers /***************************************************************************** * OpenRCT2, an open source clone of Roller Coaster Tycoon 2. * @@ -14,101 +14,182 @@ *****************************************************************************/ #pragma endregion +#include +#include +#include +#include "../core/String.hpp" #include "../core/Util.hpp" -#include "ConversionTables.h" #include "Localisation.h" -sint32 rct2_to_utf8(utf8 *dst, const char *src) +/** + * Decodes an RCT2 string to a wide char string still in the original code page. + * An RCT2 string is a multi-byte string where every two-byte code point is preceded with a byte value of 255. 
+ */ +static std::wstring DecodeToWideChar(const std::string_view& src) { - wchar_t codepoint; + std::wstring decoded; + decoded.reserve(src.size()); + for (auto it = src.begin(); it != src.end(); ) + { + uint8_t c = *it++; + if (c == 255) + { + // Push next two characters + uint8 a = 0; + uint8 b = 0; + if (it != src.end()) + { + a = *it++; + if (it != src.end()) + { + b = *it++; + } + else + { + // 2nd byte for double byte character is missing + break; + } + } + else + { + // 1st byte for double byte character is missing + break; + } - utf8 *start = dst; - const char *ch = src; - while (*ch != 0) { - if (*ch == (char)(uint8)0xFF) { - ch++; - - // Read wide char - uint8 a = *ch++; - uint8 b = *ch++; - codepoint = (a << 8) | b; - } else { - codepoint = (uint8)(*ch++); - codepoint = encoding_convert_rct2_to_unicode(codepoint); + wchar_t cp = (a << 8) | b; + decoded.push_back(cp); + } + else + { + // Push character + decoded.push_back(c); } - - dst = utf8_write_codepoint(dst, codepoint); } - dst = utf8_write_codepoint(dst, 0); - return (sint32)(dst - start); + return decoded; } -sint32 utf8_to_rct2(char *dst, const utf8 *src) +static std::string DecodeToMultiByte(const std::string_view& src) { - char *start = dst; - const utf8 *ch = src; + auto wide = DecodeToWideChar(src); + std::string result; + result.reserve(wide.size()); + for (auto cc : wide) + { + if (cc <= 255) + { + result.push_back(cc); + } + else + { + result.push_back((cc >> 8) & 0xFF); + result.push_back(cc & 0xFF); + } + } + return result; +} + +/** + * Encodes a UTF-8 string as an RCT2 string. 
+ */ +static std::string Encode(const std::string_view& src) +{ + std::string dst; + const utf8 * ch = src.data(); sint32 codepoint; - while ((codepoint = utf8_get_next(ch, &ch)) != 0) { + while ((codepoint = utf8_get_next(ch, &ch)) != 0) + { codepoint = encoding_convert_unicode_to_rct2(codepoint); - if (codepoint < 256) { - *dst++ = (char)codepoint; - } else if (codepoint <= 0xFFFF) { - *dst++ = (char)(uint8)0xFF; - *dst++ = (codepoint >> 8) & 0xFF; - *dst++ = codepoint & 0xFF; + if (codepoint <= std::numeric_limits::max()) + { + dst.push_back(codepoint); + } + else if (codepoint <= std::numeric_limits::max()) + { + dst.push_back((char)(uint8)0xFF); + dst.push_back((codepoint >> 8) & 0xFF); + dst.push_back(codepoint & 0xFF); + } + else + { + // RCT2 strings do not support code points greater than 65535, replace them with '?' + dst.push_back('?'); } } - *dst++ = 0; - return (sint32)(dst - start); + return dst; } -static sint32 encoding_search_compare(const void *pKey, const void *pEntry) +static sint32 GetCodePageForRCT2Language(RCT2LanguageId languageId) { - uint16 key = *((uint16*)pKey); - encoding_convert_entry *entry = (encoding_convert_entry*)pEntry; - if (key < entry->code) return -1; - if (key > entry->code) return 1; - return 0; -} - -static wchar_t encoding_convert_x_to_unicode(wchar_t code, const encoding_convert_entry *table, size_t count) -{ - encoding_convert_entry * entry = (encoding_convert_entry *)bsearch(&code, table, count, sizeof(encoding_convert_entry), encoding_search_compare); - if (entry == nullptr) return code; - else return entry->unicode; -} - -uint32 encoding_convert_unicode_to_rct2(uint32 unicode) -{ - // Can't do a binary search as it's sorted by RCT2 code, not unicode - for (uint32 i = 0; i < Util::CountOf(RCT2ToUnicodeTable); i++) { - if (RCT2ToUnicodeTable[i].unicode == unicode) return RCT2ToUnicodeTable[i].code; + switch (languageId) + { + case RCT2_LANGUAGE_ID_JAPANESE: + return CODE_PAGE::CP_932; + case 
RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED: + return CODE_PAGE::CP_936; + case RCT2_LANGUAGE_ID_KOREAN: + return CODE_PAGE::CP_949; + case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL: + return CODE_PAGE::CP_950; + default: + return CODE_PAGE::CP_1252; } - return unicode; } -wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str) +template +static std::string DecodeConvertWithTable(const std::string_view& src, TConvertFunc func) { - return encoding_convert_x_to_unicode(rct2str, RCT2ToUnicodeTable, Util::CountOf(RCT2ToUnicodeTable)); + auto decoded = DecodeToWideChar(src); + std::wstring u16; + u16.reserve(decoded.size()); + for (auto cc : decoded) + { + u16.push_back(func(cc)); + } + return String::ToUtf8(u16); } -wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312) +std::string rct2_to_utf8(const std::string_view& src, RCT2LanguageId languageId) { - return encoding_convert_x_to_unicode(gb2312 - 0x8080, GB2312ToUnicodeTable, Util::CountOf(GB2312ToUnicodeTable)); + auto codePage = GetCodePageForRCT2Language(languageId); + + std::string result; + switch (codePage) + { + case CODE_PAGE::CP_1252: + // The code page used by RCT2 was not quite 1252 as some codes were used for Polish characters. 
+ result = DecodeConvertWithTable(src, encoding_convert_rct2_to_unicode); + break; + +#ifdef _WIN32 + default: + auto decoded = DecodeToMultiByte(src); + result = String::Convert(decoded, codePage, CODE_PAGE::CP_UTF8); +#else + // TODO Change this to use a library such as libicu + case CODE_PAGE::CP_932: + result = DecodeConvertWithTable(src, encoding_convert_cp932_to_unicode); + break; + case CODE_PAGE::CP_936: + result = DecodeConvertWithTable(src, encoding_convert_gb2312_to_unicode); + break; + case CODE_PAGE::CP_949: + result = DecodeConvertWithTable(src, encoding_convert_cp949_to_unicode); + break; + case CODE_PAGE::CP_950: + result = DecodeConvertWithTable(src, encoding_convert_big5_to_unicode); + break; + default: + throw std::runtime_error("Unsupported code page: " + std::to_string(codePage)); + break; +#endif + } + return result; } -wchar_t encoding_convert_big5_to_unicode(wchar_t big5) +std::string utf8_to_rct2(const std::string_view& src) { - return encoding_convert_x_to_unicode(big5, Big5ToUnicodeTable, Util::CountOf(Big5ToUnicodeTable)); + // NOTE: This is only used for SC6 / SV6 files which don't store the language identifier + // because of this, we can only store in RCT2's CP_1252 format. We can preserve some + // unicode characters, but only those between 256 and 65535. 
+ return Encode(src); } - -wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932) -{ - return encoding_convert_x_to_unicode(cp932, CP932ToUnicodeTable, Util::CountOf(CP932ToUnicodeTable)); -} - -wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949) -{ - return encoding_convert_x_to_unicode(cp949, CP949ToUnicodeTable, Util::CountOf(CP949ToUnicodeTable)); -} - diff --git a/src/openrct2/localisation/FormatCodes.h b/src/openrct2/localisation/FormatCodes.h index e7d0d518cf..4789c86dda 100644 --- a/src/openrct2/localisation/FormatCodes.h +++ b/src/openrct2/localisation/FormatCodes.h @@ -139,4 +139,46 @@ enum { FORMAT_COMMA1DP16 = 20004 }; +enum RCT2Polish +{ + RCT2_A_OGONEK_UC = 159, // 0x9F + RCT2_C_ACUTE_UC = 162, // 0xA2 + RCT2_E_OGONEK_UC = 166, // 0xA6 + RCT2_N_ACUTE_UC = 198, // 0xC6 + RCT2_L_STROKE_UC = 167, // 0xA7 + RCT2_S_ACUTE_UC = 208, // 0xD0 + RCT2_Z_DOT_UC = 216, // 0xD8 + RCT2_Z_ACUTE_UC = 215, // 0xD7 + + RCT2_A_OGONEK = 221, // 0xDD + RCT2_C_ACUTE = 222, // 0xDE + RCT2_E_OGONEK = 230, // 0xE6 + RCT2_N_ACUTE = 240, // 0xF0 + RCT2_L_STROKE = 247, // 0xF7 + RCT2_S_ACUTE = 248, // 0xF8 + RCT2_Z_DOT = 253, // 0xFD + RCT2_Z_ACUTE = 254, // 0xFE +}; + +enum UnicodePolish +{ + UNICODE_A_OGONEK_UC = 260, + UNICODE_C_ACUTE_UC = 262, + UNICODE_E_OGONEK_UC = 280, + UNICODE_N_ACUTE_UC = 323, + UNICODE_L_STROKE_UC = 321, + UNICODE_S_ACUTE_UC = 346, + UNICODE_Z_DOT_UC = 379, + UNICODE_Z_ACUTE_UC = 377, + + UNICODE_A_OGONEK = 261, + UNICODE_C_ACUTE = 263, + UNICODE_E_OGONEK = 281, + UNICODE_N_ACUTE = 324, + UNICODE_L_STROKE = 322, + UNICODE_S_ACUTE = 347, + UNICODE_Z_DOT = 380, + UNICODE_Z_ACUTE = 378, +}; + #endif diff --git a/src/openrct2/localisation/Language.cpp b/src/openrct2/localisation/Language.cpp index a419f8e697..792a5cba2a 100644 --- a/src/openrct2/localisation/Language.cpp +++ b/src/openrct2/localisation/Language.cpp @@ -176,79 +176,6 @@ void language_close_all() constexpr rct_string_id NONSTEX_BASE_STRING_ID = 3463; constexpr uint16 
MAX_OBJECT_CACHED_STRINGS = 2048; -static wchar_t convert_specific_language_character_to_unicode(RCT2LanguageId languageId, wchar_t codepoint) -{ - switch (languageId) { - case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL: - return encoding_convert_big5_to_unicode(codepoint); - case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED: - return encoding_convert_gb2312_to_unicode(codepoint); - case RCT2_LANGUAGE_ID_JAPANESE: - return encoding_convert_cp932_to_unicode(codepoint); - case RCT2_LANGUAGE_ID_KOREAN: - return encoding_convert_cp949_to_unicode(codepoint); - default: - return codepoint; - } -} - -static utf8 * convert_multibyte_charset(const char * src, size_t srcMaxSize, RCT2LanguageId languageId) -{ - constexpr char CODEPOINT_DOUBLEBYTE = (char)(uint8)0xFF; - - auto sb = StringBuilder(64); - for (const char * ch = src; (ch < src + srcMaxSize) && (*ch != '\0');) - { - if (*ch == CODEPOINT_DOUBLEBYTE) - { - ch++; - if (ch < src + srcMaxSize) - { - uint8 a = *ch++; - if (a != '\0') - { - uint8 b = *ch++; - wchar_t codepoint16 = (wchar_t)((a << 8) | b); - - codepoint16 = convert_specific_language_character_to_unicode(languageId, codepoint16); - sb.Append(codepoint16); - } - } - } - else - { - codepoint_t codepoint = (uint8)*ch++; - sb.Append(codepoint); - } - } - return sb.StealString(); -} - -static bool rct2_language_is_multibyte_charset(RCT2LanguageId languageId) -{ - switch (languageId) { - case RCT2_LANGUAGE_ID_KOREAN: - case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL: - case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED: - case RCT2_LANGUAGE_ID_JAPANESE: - return true; - default: - return false; - } -} - -utf8 * rct2_language_string_to_utf8(const char *src, size_t srcSize, RCT2LanguageId languageId) -{ - if (rct2_language_is_multibyte_charset(languageId)) - { - return convert_multibyte_charset(src, srcSize, languageId); - } - else - { - return win1252_to_utf8_alloc(src, srcSize); - } -} - bool language_get_localised_scenario_strings(const utf8 *scenarioFilename, rct_string_id *outStringIds) { 
outStringIds[0] = _languageCurrent->GetScenarioOverrideStringId(scenarioFilename, 0); diff --git a/src/openrct2/localisation/Language.h b/src/openrct2/localisation/Language.h index 2188080e0b..8725d8454c 100644 --- a/src/openrct2/localisation/Language.h +++ b/src/openrct2/localisation/Language.h @@ -18,6 +18,7 @@ #define _LANGUAGE_H_ #include +#include #include "../common.h" #include "../drawing/Font.h" @@ -109,7 +110,8 @@ sint32 utf8_length(const utf8 *text); wchar_t *utf8_to_widechar(const utf8 *src); utf8 *widechar_to_utf8(const wchar_t *src); -utf8 *rct2_language_string_to_utf8(const char *src, size_t srcSize, RCT2LanguageId languageId); +std::string rct2_to_utf8(const std::string_view& src, RCT2LanguageId languageId); +std::string utf8_to_rct2(const std::string_view& src); bool language_get_localised_scenario_strings(const utf8 *scenarioFilename, rct_string_id *outStringIds); void language_free_object_string(rct_string_id stringId); rct_string_id language_get_object_override_string_id(const char * identifier, uint8 index); diff --git a/src/openrct2/localisation/Localisation.h b/src/openrct2/localisation/Localisation.h index fad7229e22..a1b01f223d 100644 --- a/src/openrct2/localisation/Localisation.h +++ b/src/openrct2/localisation/Localisation.h @@ -53,15 +53,18 @@ bool is_user_string_id(rct_string_id stringId); utf8 *win1252_to_utf8_alloc(const char *src, size_t srcMaxSize); sint32 win1252_to_utf8(utf8string dst, const char *src, size_t srcLength, size_t maxBufferLength); -sint32 rct2_to_utf8(utf8 *dst, const char *src); -sint32 utf8_to_rct2(char *dst, const utf8 *src); wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str); uint32 encoding_convert_unicode_to_rct2(uint32 unicode); + +#ifndef _WIN32 + wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312); wchar_t encoding_convert_big5_to_unicode(wchar_t big5); wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932); wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949); +#endif + #define 
MAX_USER_STRINGS 1024 #define USER_STRING_MAX_LENGTH 32 diff --git a/src/openrct2/object/StringTable.cpp b/src/openrct2/object/StringTable.cpp index 521124a7ae..cc1a5568b5 100644 --- a/src/openrct2/object/StringTable.cpp +++ b/src/openrct2/object/StringTable.cpp @@ -68,13 +68,13 @@ void StringTable::Read(IReadObjectContext * context, IStream * stream, uint8 id) entry.LanguageId = languageId; std::string stringAsWin1252 = stream->ReadStdString(); - utf8 * stringAsUtf8 = rct2_language_string_to_utf8(stringAsWin1252.c_str(), stringAsWin1252.size(), rct2LanguageId); + auto stringAsUtf8 = rct2_to_utf8(stringAsWin1252, rct2LanguageId); - if (StringIsBlank(stringAsUtf8)) + if (StringIsBlank(stringAsUtf8.data())) { entry.LanguageId = LANGUAGE_UNDEFINED; } - String::Trim(stringAsUtf8); + stringAsUtf8 = String::Trim(stringAsUtf8); entry.Text = stringAsUtf8; _strings.push_back(entry); diff --git a/src/openrct2/rct1/S4Importer.cpp b/src/openrct2/rct1/S4Importer.cpp index d3510f5900..9b552f03f6 100644 --- a/src/openrct2/rct1/S4Importer.cpp +++ b/src/openrct2/rct1/S4Importer.cpp @@ -271,10 +271,7 @@ public: dst->objective_arg_2 = _s4.scenario_objective_currency; dst->objective_arg_3 = _s4.scenario_objective_num_guests; - utf8 utf8name[256]; - rct2_to_utf8(utf8name, _s4.scenario_name); - - std::string name = std::string(utf8name, sizeof(utf8name)); + auto name = rct2_to_utf8(_s4.scenario_name, RCT2_LANGUAGE_ID_ENGLISH_UK); std::string details; // TryGetById won't set this property if the scenario is not recognised, @@ -2759,10 +2756,8 @@ private: std::string GetUserString(rct_string_id stringId) { - utf8 buffer[128] = { 0 }; const char * originalString = _s4.string_table[(stringId - USER_STRING_START) % 1024]; - rct2_to_utf8(buffer, originalString); - return std::string(buffer); + return rct2_to_utf8(originalString, RCT2_LANGUAGE_ID_ENGLISH_UK); } void FixLandOwnership() diff --git a/src/openrct2/rct2/S6Exporter.cpp b/src/openrct2/rct2/S6Exporter.cpp index 112a524baf..fed8587bdc 
100644 --- a/src/openrct2/rct2/S6Exporter.cpp +++ b/src/openrct2/rct2/S6Exporter.cpp @@ -164,8 +164,14 @@ void S6Exporter::Export() log_error("Found %d disjoint null sprites", disjoint_sprites_count); } _s6.info = gS6Info; - utf8_to_rct2(_s6.info.name, gS6Info.name); - utf8_to_rct2(_s6.info.details, gS6Info.details); + { + auto temp = utf8_to_rct2(gS6Info.name); + safe_strcpy(_s6.info.name, temp.data(), sizeof(_s6.info.name)); + } + { + auto temp = utf8_to_rct2(gS6Info.details); + safe_strcpy(_s6.info.details, temp.data(), sizeof(_s6.info.details)); + } uint32 researchedTrackPiecesA[128]; uint32 researchedTrackPiecesB[128]; diff --git a/src/openrct2/rct2/S6Importer.cpp b/src/openrct2/rct2/S6Importer.cpp index 8d1baa7f9a..631d801661 100644 --- a/src/openrct2/rct2/S6Importer.cpp +++ b/src/openrct2/rct2/S6Importer.cpp @@ -47,6 +47,7 @@ #include "../scenario/Scenario.h" #include "../scenario/ScenarioRepository.h" #include "../util/SawyerCoding.h" +#include "../util/Util.h" #include "../world/Climate.h" #include "../world/Entrance.h" #include "../world/MapAnimation.h" @@ -200,8 +201,15 @@ public: // _s6.header gS6Info = _s6.info; - rct2_to_utf8(gS6Info.name, _s6.info.name); - rct2_to_utf8(gS6Info.details, _s6.info.details); + + { + auto temp = rct2_to_utf8(_s6.info.name, RCT2_LANGUAGE_ID_ENGLISH_UK); + safe_strcpy(gS6Info.name, temp.data(), sizeof(gS6Info.name)); + } + { + auto temp = rct2_to_utf8(_s6.info.details, RCT2_LANGUAGE_ID_ENGLISH_UK); + safe_strcpy(gS6Info.details, temp.data(), sizeof(gS6Info.details)); + } gDateMonthsElapsed = _s6.elapsed_months; gDateMonthTicks = _s6.current_day; diff --git a/test/tests/CMakeLists.txt b/test/tests/CMakeLists.txt index c87ff62982..50ef66c92b 100644 --- a/test/tests/CMakeLists.txt +++ b/test/tests/CMakeLists.txt @@ -76,6 +76,8 @@ set(COMMON_TEST_SOURCES "${ROOT_DIR}/src/openrct2/core/Guard.cpp" "${ROOT_DIR}/src/openrct2/core/String.cpp" "${ROOT_DIR}/src/openrct2/Diagnostic.cpp" + 
"${ROOT_DIR}/src/openrct2/localisation/ConversionTables.cpp" + "${ROOT_DIR}/src/openrct2/localisation/Convert.cpp" "${ROOT_DIR}/src/openrct2/localisation/FormatCodes.cpp" "${ROOT_DIR}/src/openrct2/localisation/UTF8.cpp" "${ROOT_DIR}/src/openrct2/util/Util.cpp" @@ -139,6 +141,11 @@ add_executable(test_string ${STRING_TEST_SOURCES}) target_link_libraries(test_string ${GTEST_LIBRARIES} test-common ${LDL} z) add_test(NAME string COMMAND test_string) +# Localisation test +set(STRING_TEST_SOURCES "${CMAKE_CURRENT_LIST_DIR}/Localisation.cpp") +add_executable(test_localisation ${STRING_TEST_SOURCES}) +target_link_libraries(test_localisation ${GTEST_LIBRARIES} test-common ${LDL} z) +add_test(NAME localisation COMMAND test_localisation) # Ride ratings test set(RIDE_RATINGS_TEST_SOURCES "${CMAKE_CURRENT_LIST_DIR}/RideRatings.cpp" diff --git a/test/tests/Localisation.cpp b/test/tests/Localisation.cpp new file mode 100644 index 0000000000..611abb7a9e --- /dev/null +++ b/test/tests/Localisation.cpp @@ -0,0 +1,80 @@ +#include "helpers/StringHelpers.hpp" +#include "openrct2/localisation/Localisation.h" +#include + +class Localisation : public testing::Test +{ +}; + +/////////////////////////////////////////////////////////////////////////////// +// Tests for rct2_to_utf8 +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(Localisation, RCT2_to_UTF8_UK) +{ + auto input = "The quick brown fox"; + auto expected = u8"The quick brown fox"; + auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_ENGLISH_UK); + ASSERT_EQ(expected, actual); +} + +TEST_F(Localisation, RCT2_to_UTF8_JP) +{ + auto input = StringFromHex("ff8374ff8340ff8358ff8367ff8375ff8389ff8345ff8393ff8374ff8348ff8362ff834eff8358"); + auto expected = u8"ファストブラウンフォックス"; + auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_JAPANESE); + ASSERT_EQ(expected, actual); +} + +TEST_F(Localisation, RCT2_to_UTF8_ZH_TW) +{ + auto input = StringFromHex("ffa7d6ffb374ffaabaffb4c4ffa6e2ffaab0ffaf57"); + 
auto expected = u8"快速的棕色狐狸"; + auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL); + ASSERT_EQ(expected, actual); +} + +TEST_F(Localisation, RCT2_to_UTF8_PL) +{ + auto input = StringFromHex("47F372736b6120446ff76b692054e6637a6f7779"); + auto expected = u8"Górska Dołki Tęczowy"; + auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_ENGLISH_UK); + ASSERT_EQ(expected, actual); +} + +TEST_F(Localisation, RCT2_to_UTF8_ZH_TW_PREMATURE_END) +{ + // This string can be found in BATFL.DAT, the last double byte character is missing its second byte. + auto input = StringFromHex("ffa470ffabacffa8aeffbdf8ffa662ffc54bffb944ffa457ffaeb6ffb0caffb76effc2"); + auto expected = u8"小型車輛在鐵道上振動搖"; + auto actual = rct2_to_utf8(input, RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL); + ASSERT_EQ(expected, actual); +} + +/////////////////////////////////////////////////////////////////////////////// +// Tests for utf8_to_rct2 +/////////////////////////////////////////////////////////////////////////////// + +TEST_F(Localisation, UTF8_to_RCT2_Basic) +{ + auto input = u8"à l'époque était"; + auto expected = StringFromHex("e0206c27e9706f71756520e974616974"); + auto actual = utf8_to_rct2(input); + ASSERT_EQ(expected, actual); +} + +TEST_F(Localisation, UTF8_to_RCT2_ChineseTraditional) +{ + auto input = u8"$: 快速的棕色狐狸"; + auto expected = StringFromHex("243a20ff5febff901fff7684ff68d5ff8272ff72d0ff72f8"); + auto actual = utf8_to_rct2(input); + ASSERT_EQ(expected, actual); +} + +TEST_F(Localisation, UTF8_to_RCT2_PL) +{ + auto input = u8"Górska Dołki Tęczowy"; + auto expected = StringFromHex("47F372736b6120446ff76b692054e6637a6f7779"); + auto actual = utf8_to_rct2(input); + ASSERT_EQ(expected, actual); +} diff --git a/test/tests/StringTest.cpp b/test/tests/StringTest.cpp index d0d6c0951a..c7f9704e5e 100644 --- a/test/tests/StringTest.cpp +++ b/test/tests/StringTest.cpp @@ -4,6 +4,7 @@ #include #include #include "AssertHelpers.hpp" +#include "helpers/StringHelpers.hpp" using TCase = 
std::tuple; @@ -11,6 +12,10 @@ class StringTest : public testing::TestWithParam { }; +/////////////////////////////////////////////////////////////////////////////// +// Tests for String::Trim +/////////////////////////////////////////////////////////////////////////////// + INSTANTIATE_TEST_CASE_P(TrimData, StringTest, testing::Values( // input after Trim after TrimStart TCase("string", "string", "string"), @@ -45,6 +50,10 @@ TEST_P(StringTest, TrimStart) ASSERT_EQ(expected, actual); } +/////////////////////////////////////////////////////////////////////////////// +// Tests for String::Split +/////////////////////////////////////////////////////////////////////////////// + TEST_F(StringTest, Split_ByComma) { auto actual = String::Split("a,bb,ccc,dd", ","); @@ -64,3 +73,44 @@ TEST_F(StringTest, Split_ByEmpty) { EXPECT_THROW(String::Split("string", ""), std::invalid_argument); } + +/////////////////////////////////////////////////////////////////////////////// +// Tests for String::Convert +/////////////////////////////////////////////////////////////////////////////// + +// TODO Remove when String::Convert is implemented for non-Windows platforms +#ifdef _WIN32 + +TEST_F(StringTest, Convert_950_to_UTF8) +{ + auto input = StringFromHex("a7d6b374aabab4c4a6e2aab0af57"); + auto expected = u8"快速的棕色狐狸"; + auto actual = String::Convert(input, CODE_PAGE::CP_950, CODE_PAGE::CP_UTF8); + ASSERT_EQ(expected, actual); +} + +TEST_F(StringTest, Convert_UTF8_to_932) +{ + auto input = u8"ファストブラウンフォックス"; + auto expected = StringFromHex("83748340835883678375838983458393837483488362834e8358"); + auto actual = String::Convert(input, CODE_PAGE::CP_UTF8, CODE_PAGE::CP_932); + ASSERT_EQ(expected, actual); +} + +TEST_F(StringTest, Convert_UTF8_to_UTF8) +{ + auto input = u8"سريع|brown|ثعلب"; + auto expected = input; + auto actual = String::Convert(input, CODE_PAGE::CP_UTF8, CODE_PAGE::CP_UTF8); + ASSERT_EQ(expected, actual); +} + +TEST_F(StringTest, Convert_Empty) +{ + auto input = ""; + 
auto expected = input; + auto actual = String::Convert(input, CODE_PAGE::CP_1252, CODE_PAGE::CP_UTF8); + ASSERT_EQ(expected, actual); +} + +#endif diff --git a/test/tests/helpers/StringHelpers.hpp b/test/tests/helpers/StringHelpers.hpp new file mode 100644 index 0000000000..b0d999ba8e --- /dev/null +++ b/test/tests/helpers/StringHelpers.hpp @@ -0,0 +1,17 @@ +#include +#include +#include + +inline std::string StringFromHex(const std::string_view& input) +{ + assert((input.size() & 1) == 0); + + std::string result; + result.reserve(input.size() / 2); + for (size_t i = 0; i < input.size(); i += 2) + { + auto val = std::stoi(std::string(input.substr(i, 2)), 0, 16); + result.push_back(val); + } + return result; +} diff --git a/test/tests/tests.vcxproj b/test/tests/tests.vcxproj index 9fd921b000..8c23f3c620 100644 --- a/test/tests/tests.vcxproj +++ b/test/tests/tests.vcxproj @@ -52,12 +52,14 @@ + +