1
0
mirror of https://github.com/OpenTTD/OpenTTD synced 2026-01-16 00:42:45 +01:00

Codechange: Use Utf8View::iterator in StringIterator.

This commit is contained in:
frosch
2025-04-01 14:58:16 +02:00
committed by frosch
parent b19e43ae99
commit 83401ad5e2
7 changed files with 53 additions and 79 deletions

View File

@@ -13,6 +13,7 @@
#include "error_func.h"
#include "string_func.h"
#include "string_base.h"
#include "core/utf8.hpp"
#include "table/control_codes.h"
@@ -826,10 +827,8 @@ public:
delete this->word_itr;
}
void SetString(const char *s) override
void SetString(std::string_view s) override
{
const char *string_base = s;
/* Unfortunately current ICU versions only provide rudimentary support
* for word break iterators (especially for CJK languages) in combination
* with UTF-8 input. As a work around we have to convert the input to
@@ -837,10 +836,10 @@ public:
this->utf16_str.clear();
this->utf16_to_utf8.clear();
while (*s != '\0') {
size_t idx = s - string_base;
char32_t c = Utf8Consume(&s);
Utf8View view(s);
for (auto it = view.begin(), end = view.end(); it != end; ++it) {
size_t idx = it.GetByteOffset();
char32_t c = *it;
if (c < 0x10000) {
this->utf16_str.push_back((UChar)c);
} else {
@@ -852,7 +851,7 @@ public:
this->utf16_to_utf8.push_back(idx);
}
this->utf16_str.push_back('\0');
this->utf16_to_utf8.push_back(s - string_base);
this->utf16_to_utf8.push_back(s.size());
UText text = UTEXT_INITIALIZER;
UErrorCode status = U_ZERO_ERROR;
@@ -956,60 +955,43 @@ public:
/** Fallback simple string iterator. */
class DefaultStringIterator : public StringIterator
{
const char *string; ///< Current string.
size_t len; ///< String length.
size_t cur_pos; ///< Current iteration position.
Utf8View string; ///< Current string.
Utf8View::iterator cur_pos; //< Current iteration position.
public:
DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
{
}
void SetString(const char *s) override
void SetString(std::string_view s) override
{
this->string = s;
this->len = strlen(s);
this->cur_pos = 0;
this->cur_pos = this->string.begin();
}
size_t SetCurPosition(size_t pos) override
{
assert(this->string != nullptr && pos <= this->len);
/* Sanitize in case we get a position inside an UTF-8 sequence. */
while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
return this->cur_pos = pos;
this->cur_pos = this->string.GetIterAtByte(pos);
return this->cur_pos.GetByteOffset();
}
size_t Next(IterType what) override
{
assert(this->string != nullptr);
const auto end = this->string.end();
/* Already at the end? */
if (this->cur_pos >= this->len) return END;
if (this->cur_pos >= end) return END;
switch (what) {
case ITER_CHARACTER: {
char32_t c;
this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
return this->cur_pos;
}
case ITER_CHARACTER:
++this->cur_pos;
return this->cur_pos.GetByteOffset();
case ITER_WORD: {
char32_t c;
case ITER_WORD:
/* Consume current word. */
size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos < this->len && !IsWhitespace(c)) {
this->cur_pos += offs;
offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos != end && !IsWhitespace(*this->cur_pos)) {
++this->cur_pos;
}
/* Consume whitespace to the next word. */
while (this->cur_pos < this->len && IsWhitespace(c)) {
this->cur_pos += offs;
offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos != end && IsWhitespace(*this->cur_pos)) {
++this->cur_pos;
}
return this->cur_pos;
}
return this->cur_pos.GetByteOffset();
default:
NOT_REACHED();
@@ -1020,33 +1002,27 @@ public:
size_t Prev(IterType what) override
{
assert(this->string != nullptr);
const auto begin = this->string.begin();
/* Already at the beginning? */
if (this->cur_pos == 0) return END;
if (this->cur_pos == begin) return END;
switch (what) {
case ITER_CHARACTER:
return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
--this->cur_pos;
return this->cur_pos.GetByteOffset();
case ITER_WORD: {
const char *s = this->string + this->cur_pos;
char32_t c;
case ITER_WORD:
/* Consume preceding whitespace. */
do {
s = Utf8PrevChar(s);
Utf8Decode(&c, s);
} while (s > this->string && IsWhitespace(c));
--this->cur_pos;
} while (this->cur_pos != begin && IsWhitespace(*this->cur_pos));
/* Consume preceding word. */
while (s > this->string && !IsWhitespace(c)) {
s = Utf8PrevChar(s);
Utf8Decode(&c, s);
while (this->cur_pos != begin && !IsWhitespace(*this->cur_pos)) {
--this->cur_pos;
}
/* Move caret back to the beginning of the word. */
if (IsWhitespace(c)) Utf8Consume(&s);
return this->cur_pos = s - this->string;
}
if (IsWhitespace(*this->cur_pos)) ++this->cur_pos;
return this->cur_pos.GetByteOffset();
default:
NOT_REACHED();