1
0
mirror of https://github.com/OpenRCT2/OpenRCT2 synced 2026-01-22 22:34:33 +01:00

Refactor string conversion between RCT2 and UTF8

Use Win32 API for conversion on Windows for non-1252 code pages instead of built-in tables.
This commit is contained in:
Ted John
2018-04-14 18:06:49 +01:00
committed by Aaron van Geffen
parent 2512e4959c
commit deaa60f8f1
17 changed files with 343 additions and 250 deletions

View File

@@ -1,4 +1,4 @@
#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers
#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers
/*****************************************************************************
* OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
*
@@ -14,101 +14,172 @@
*****************************************************************************/
#pragma endregion
#include <algorithm>
#include <limits>
#include <stdexcept>
#include "../core/String.hpp"
#include "../core/Util.hpp"
#include "ConversionTables.h"
#include "Localisation.h"
sint32 rct2_to_utf8(utf8 *dst, const char *src)
/**
 * Decodes an RCT2 string to a wide char string still in the original code page.
 * An RCT2 string is a multi-byte string where every two-byte code point is preceded by a byte value of 255.
 *
 * @param src The raw RCT2-encoded bytes.
 * @return One wchar_t per decoded code point: plain bytes are widened unchanged and each
 *         255-prefixed pair becomes (first << 8) | second. No code page translation happens here.
 */
static std::wstring DecodeToWideChar(const std::string_view& src)
{
    std::wstring decoded;
    decoded.reserve(src.size());

    // The iterator is only advanced after it has been checked against the end, so a string that
    // is cut off in the middle of a two-byte sequence never steps past src.end().
    auto it = src.begin();
    const auto end = src.end();
    while (it != end)
    {
        const auto c = static_cast<uint8_t>(*it++);
        if (c != 255)
        {
            // Push character
            decoded.push_back(c);
            continue;
        }

        // Push next two characters; bytes missing because of truncation are treated as 0.
        uint8_t a = 0;
        uint8_t b = 0;
        if (it != end)
        {
            a = static_cast<uint8_t>(*it++);
            if (it != end)
            {
                b = static_cast<uint8_t>(*it++);
            }
        }
        decoded.push_back(static_cast<wchar_t>((a << 8) | b));
    }
    return decoded;
}
sint32 utf8_to_rct2(char *dst, const utf8 *src)
/**
 * Decodes an RCT2 string with DecodeToWideChar and writes the code points back out as plain
 * bytes: values up to 255 become a single byte, larger values become their high byte followed
 * by their low byte. The 255 marker byte of the RCT2 form is not reproduced.
 *
 * @param src The raw RCT2-encoded bytes.
 * @return The flattened multi-byte string.
 */
static std::string DecodeToMultiByte(const std::string_view& src)
{
    const std::wstring wide = DecodeToWideChar(src);

    std::string result;
    result.reserve(wide.size());
    for (const wchar_t codepoint : wide)
    {
        if (codepoint > 255)
        {
            // Two-byte code point: high byte first, then low byte.
            result.push_back((codepoint >> 8) & 0xFF);
            result.push_back(codepoint & 0xFF);
        }
        else
        {
            result.push_back(codepoint);
        }
    }
    return result;
}
/**
 * Encodes a UTF-8 string as an RCT2 string.
 *
 * Every code point is first mapped with encoding_convert_unicode_to_rct2. Values up to 255 are
 * written as a single byte, values up to 65535 as the marker byte 255 followed by the high and
 * low byte, and anything larger is replaced with '?'.
 *
 * @param src UTF-8 text; it does not have to be null-terminated. Encoding stops at the first
 *            code point for which utf8_get_next returns 0.
 * @return The RCT2-encoded bytes as a std::string.
 */
static std::string Encode(const std::string_view& src)
{
    // The loop below only ends when utf8_get_next returns 0, but a string_view is not guaranteed
    // to be null-terminated. Walk an owned copy so that a terminator is always present.
    const std::string terminated(src);

    std::string dst;
    const utf8 * ch = terminated.c_str();
    sint32 codepoint;
    while ((codepoint = utf8_get_next(ch, &ch)) != 0)
    {
        codepoint = encoding_convert_unicode_to_rct2(codepoint);
        if (codepoint <= std::numeric_limits<uint8>::max())
        {
            dst.push_back(static_cast<char>(codepoint));
        }
        else if (codepoint <= std::numeric_limits<uint16>::max())
        {
            // Two-byte code point: marker byte, then high and low byte.
            dst.push_back(static_cast<char>(0xFF));
            dst.push_back(static_cast<char>((codepoint >> 8) & 0xFF));
            dst.push_back(static_cast<char>(codepoint & 0xFF));
        }
        else
        {
            // RCT2 strings do not support code points greater than 65535, replace them with '?'
            dst.push_back('?');
        }
    }
    return dst;
}
// NOTE(review): this span appears to interleave the removed and the added side of a diff, so the
// statements of several functions are mixed together. Each comment below describes only the
// statements that belong to the function it names — confirm against the real file.

// bsearch comparator: compares a uint16 key with encoding_convert_entry::code and returns -1, 1 or 0.
static sint32 encoding_search_compare(const void *pKey, const void *pEntry)
// Returns the code page for an RCT2 language id (see the switch further down); language ids
// without a dedicated case use CP_1252.
static sint32 GetCodePageForRCT2Language(RCT2LanguageId languageId)
{
uint16 key = *((uint16*)pKey);
encoding_convert_entry *entry = (encoding_convert_entry*)pEntry;
if (key < entry->code) return -1;
if (key > entry->code) return 1;
return 0;
}
// Looks up code in table with bsearch; yields the matching entry's unicode value, or code itself
// when no entry is found.
static wchar_t encoding_convert_x_to_unicode(wchar_t code, const encoding_convert_entry *table, size_t count)
{
encoding_convert_entry * entry = (encoding_convert_entry *)bsearch(&code, table, count, sizeof(encoding_convert_entry), encoding_search_compare);
if (entry == nullptr) return code;
else return entry->unicode;
}
// Scans RCT2ToUnicodeTable linearly for an entry whose unicode value matches and returns its
// RCT2 code; the trailing `return unicode;` hands the input back when nothing matches.
uint32 encoding_convert_unicode_to_rct2(uint32 unicode)
{
// Can't do a binary search as it's sorted by RCT2 code, not unicode
for (uint32 i = 0; i < Util::CountOf(RCT2ToUnicodeTable); i++) {
if (RCT2ToUnicodeTable[i].unicode == unicode) return RCT2ToUnicodeTable[i].code;
// Language id -> code page mapping of GetCodePageForRCT2Language.
switch (languageId)
{
case RCT2_LANGUAGE_ID_JAPANESE:
return CODE_PAGE::CP_932;
case RCT2_LANGUAGE_ID_CHINESE_SIMPLIFIED:
return CODE_PAGE::CP_936;
case RCT2_LANGUAGE_ID_KOREAN:
return CODE_PAGE::CP_949;
case RCT2_LANGUAGE_ID_CHINESE_TRADITIONAL:
return CODE_PAGE::CP_950;
default:
return CODE_PAGE::CP_1252;
}
return unicode;
}
wchar_t encoding_convert_rct2_to_unicode(wchar_t rct2str)
/**
 * Decodes an RCT2 string and converts it to UTF-8 by passing every decoded code point through func.
 *
 * @param src  The raw RCT2-encoded bytes.
 * @param func Callable invoked once per code point produced by DecodeToWideChar, in order; its
 *             results form the wide string that is handed to String::ToUtf8.
 * @return The value returned by String::ToUtf8 for the converted wide string.
 */
template<typename TConvertFunc>
static std::string DecodeConvertWithTable(const std::string_view& src, TConvertFunc func)
{
    const std::wstring codePoints = DecodeToWideChar(src);

    std::wstring converted;
    converted.reserve(codePoints.size());
    for (auto it = codePoints.begin(); it != codePoints.end(); ++it)
    {
        wchar_t codePoint = *it;
        converted.push_back(func(codePoint));
    }
    return String::ToUtf8(converted);
}
wchar_t encoding_convert_gb2312_to_unicode(wchar_t gb2312)
/**
 * Converts an RCT2-encoded string to UTF-8, using the code page that
 * GetCodePageForRCT2Language selects for languageId.
 *
 * @param src        The raw RCT2-encoded bytes.
 * @param languageId RCT2 language id that determines the source code page.
 * @return The converted string.
 * @throws std::runtime_error on non-Windows builds when the code page has no case in the switch.
 */
std::string rct2_to_utf8(const std::string_view& src, RCT2LanguageId languageId)
{
// NOTE(review): the next line uses gb2312, which is not declared in this function — it looks like
// a leftover of encoding_convert_gb2312_to_unicode; confirm against the real file.
return encoding_convert_x_to_unicode(gb2312 - 0x8080, GB2312ToUnicodeTable, Util::CountOf(GB2312ToUnicodeTable));
auto codePage = GetCodePageForRCT2Language(languageId);
std::string result;
switch (codePage)
{
case CODE_PAGE::CP_1252:
// The code page used by RCT2 was not quite 1252 as some codes were used for Polish characters.
result = DecodeConvertWithTable(src, encoding_convert_rct2_to_unicode);
break;
#ifdef _WIN32
// Windows builds: every other code page is flattened with DecodeToMultiByte and converted with
// String::Convert. No break follows because this is the last label of the switch in this branch.
default:
auto decoded = DecodeToMultiByte(src);
result = String::Convert(decoded, codePage, CODE_PAGE::CP_UTF8);
#else
// Other builds: one conversion function per supported code page; anything else throws.
// TODO Change this to use a library such as libicu
case CODE_PAGE::CP_932:
result = DecodeConvertWithTable(src, encoding_convert_cp932_to_unicode);
break;
case CODE_PAGE::CP_936:
result = DecodeConvertWithTable(src, encoding_convert_gb2312_to_unicode);
break;
case CODE_PAGE::CP_949:
result = DecodeConvertWithTable(src, encoding_convert_cp949_to_unicode);
break;
case CODE_PAGE::CP_950:
result = DecodeConvertWithTable(src, encoding_convert_big5_to_unicode);
break;
default:
throw std::runtime_error("Unsupported code page: " + std::to_string(codePage));
break;
#endif
}
return result;
}
wchar_t encoding_convert_big5_to_unicode(wchar_t big5)
std::string utf8_to_rct2(const std::string_view& src)
{
return encoding_convert_x_to_unicode(big5, Big5ToUnicodeTable, Util::CountOf(Big5ToUnicodeTable));
// NOTE: This is only used for SC6 / SV6 files which don't store the language identifier
// because of this, we can only store in RCT2's CP_1252 format. We can preserve some
// unicode characters, but only those between 256 and 65535.
return Encode(src);
}
/**
 * Converts a CP932 code to unicode by delegating to encoding_convert_x_to_unicode with
 * CP932ToUnicodeTable.
 *
 * @param cp932 The code to look up.
 * @return Whatever encoding_convert_x_to_unicode returns for that code.
 */
wchar_t encoding_convert_cp932_to_unicode(wchar_t cp932)
{
    const auto entryCount = Util::CountOf(CP932ToUnicodeTable);
    return encoding_convert_x_to_unicode(cp932, CP932ToUnicodeTable, entryCount);
}
/**
 * Converts a CP949 code to unicode by delegating to encoding_convert_x_to_unicode with
 * CP949ToUnicodeTable.
 *
 * @param cp949 The code to look up.
 * @return Whatever encoding_convert_x_to_unicode returns for that code.
 */
wchar_t encoding_convert_cp949_to_unicode(wchar_t cp949)
{
    const auto entryCount = Util::CountOf(CP949ToUnicodeTable);
    return encoding_convert_x_to_unicode(cp949, CP949ToUnicodeTable, entryCount);
}