From c28a42d877f735a92431f80c4c0dda2001e8e60d Mon Sep 17 00:00:00 2001
From: Ted John
Date: Sun, 27 May 2018 16:30:19 +0100
Subject: [PATCH] Optimise SawyerChunkReader for MSVC debug builds

- Change std::copy_n and std::fill_n back to std::memcpy and std::memset.
  They do not have the overhead of checks.
- Change std::malloc to HeapAlloc as 16 MiB allocations are very slow due
  to it initialising all the memory to 0xCC.
---
Review notes (this section is ignored when the patch is applied):
* FinaliseLargeTempBuffer checked the std::malloc result only after copying
  into it; the copy is now skipped when the allocation failed.
* ReadChunk and DecodeChunkRLERepeat keep the temporary buffer in a
  std::unique_ptr so it is freed when a decoder throws SawyerChunkException.
* The blob ids on the "index" lines are unchanged from the original patch.

 src/openrct2/rct12/SawyerChunkReader.cpp | 102 +++++++++++++++++-------
 src/openrct2/rct12/SawyerChunkReader.h   |   4 +
 2 files changed, 77 insertions(+), 29 deletions(-)

diff --git a/src/openrct2/rct12/SawyerChunkReader.cpp b/src/openrct2/rct12/SawyerChunkReader.cpp
index b94c0ed3ce..f76d57cb50 100644
--- a/src/openrct2/rct12/SawyerChunkReader.cpp
+++ b/src/openrct2/rct12/SawyerChunkReader.cpp
@@ -1,4 +1,4 @@
-#pragma region Copyright (c) 2014-2017 OpenRCT2 Developers
+#pragma region Copyright (c) 2014-2018 OpenRCT2 Developers
 /*****************************************************************************
  * OpenRCT2, an open source clone of Roller Coaster Tycoon 2.
  *
@@ -14,12 +14,17 @@
  *****************************************************************************/
 #pragma endregion
 
-#include <algorithm>
 #include "../core/IStream.hpp"
-#include "../core/Math.hpp"
-#include "../core/Memory.hpp"
 #include "SawyerChunkReader.h"
 
+// malloc is very slow for large allocations in MSVC debug builds as it allocates
+// memory on a special debug heap and then initialises all the memory to 0xCC.
+#if defined(_WIN32) && defined(DEBUG)
+#define __USE_HEAP_ALLOC__
+#define WIN32_LEAN_AND_MEAN
+#include <windows.h>
+#endif
+
 // Allow chunks to be uncompressed to a maximum of 16 MiB
 constexpr size_t MAX_UNCOMPRESSED_CHUNK_SIZE = 16 * 1024 * 1024;
 
@@ -74,22 +79,10 @@ std::shared_ptr<SawyerChunk> SawyerChunkReader::ReadChunk()
                 throw SawyerChunkException(EXCEPTION_MSG_CORRUPT_CHUNK_SIZE);
             }
 
-            // Allow 16MiB for chunk data
-            size_t bufferSize = MAX_UNCOMPRESSED_CHUNK_SIZE;
-            uint8 * buffer = Memory::Allocate<uint8>(bufferSize);
-            if (buffer == nullptr)
-            {
-                throw std::runtime_error("Unable to allocate buffer.");
-            }
-
-            size_t uncompressedLength = DecodeChunk(buffer, bufferSize, compressedData.get(), header);
+            std::unique_ptr<void, decltype(&FreeLargeTempBuffer)> tempBuffer(AllocateLargeTempBuffer(), &FreeLargeTempBuffer);
+            size_t uncompressedLength = DecodeChunk(tempBuffer.get(), MAX_UNCOMPRESSED_CHUNK_SIZE, compressedData.get(), header);
             Guard::Assert(uncompressedLength != 0, "Encountered zero-sized chunk!");
-            buffer = Memory::Reallocate(buffer, uncompressedLength);
-            if (buffer == nullptr)
-            {
-                throw std::runtime_error("Unable to reallocate buffer.");
-            }
-
+            auto buffer = (uint8 *)FinaliseLargeTempBuffer(tempBuffer.release(), uncompressedLength);
             return std::make_shared<SawyerChunk>((SAWYER_ENCODING)header.encoding, buffer, uncompressedLength);
         }
         default:
@@ -111,16 +104,16 @@ void SawyerChunkReader::ReadChunk(void * dst, size_t length)
     auto chunkLength = chunk->GetLength();
     if (chunkLength > length)
     {
-        std::copy_n(chunkData, length, (uint8 *)dst);
+        std::memcpy(dst, chunkData, length);
     }
     else
     {
-        std::copy_n(chunkData, chunkLength, (uint8 *)dst);
+        std::memcpy(dst, chunkData, chunkLength);
         auto remainingLength = length - chunkLength;
         if (remainingLength > 0)
         {
             auto offset = (uint8 *)dst + chunkLength;
-            std::fill_n(offset, remainingLength, 0);
+            std::memset(offset, 0, remainingLength);
         }
     }
 }
@@ -135,7 +128,7 @@ size_t SawyerChunkReader::DecodeChunk(void * dst, size_t dstCapacity, const void
         {
             throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
         }
-        std::copy_n((const uint8 *)src, header.length, (uint8 *)dst);
+        std::memcpy(dst, src, header.length);
         resultLength = header.length;
         break;
     case CHUNK_ENCODING_RLE:
@@ -155,10 +148,11 @@ size_t SawyerChunkReader::DecodeChunk(void * dst, size_t dstCapacity, const void
 
 size_t SawyerChunkReader::DecodeChunkRLERepeat(void * dst, size_t dstCapacity, const void * src, size_t srcLength)
 {
-    auto immBufferLength = MAX_UNCOMPRESSED_CHUNK_SIZE;
-    auto immBuffer = std::make_unique<uint8[]>(immBufferLength);
-    auto immLength = DecodeChunkRLE(immBuffer.get(), immBufferLength, src, srcLength);
+    // Hold the intermediate buffer in a unique_ptr so that it is released even when
+    // one of the decoders below throws a SawyerChunkException.
+    std::unique_ptr<void, decltype(&FreeLargeTempBuffer)> immBuffer(AllocateLargeTempBuffer(), &FreeLargeTempBuffer);
+    auto immLength = DecodeChunkRLE(immBuffer.get(), MAX_UNCOMPRESSED_CHUNK_SIZE, src, srcLength);
     return DecodeChunkRepeat(dst, dstCapacity, immBuffer.get(), immLength);
 }
 
 size_t SawyerChunkReader::DecodeChunkRLE(void * dst, size_t dstCapacity, const void * src, size_t srcLength)
@@ -183,7 +177,7 @@ size_t SawyerChunkReader::DecodeChunkRLE(void * dst, size_t dstCapacity, const v
                 throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
             }
 
-            std::fill_n(dst8, count, src8[i]);
+            std::memset(dst8, src8[i], count);
             dst8 += count;
         }
         else
@@ -197,7 +191,7 @@ size_t SawyerChunkReader::DecodeChunkRLE(void * dst, size_t dstCapacity, const v
                 throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
             }
 
-            std::copy_n(src8 + i + 1, rleCodeByte + 1, dst8);
+            std::memcpy(dst8, src8 + i + 1, rleCodeByte + 1);
             dst8 += rleCodeByte + 1;
             i += rleCodeByte + 1;
         }
@@ -226,7 +220,7 @@ size_t SawyerChunkReader::DecodeChunkRepeat(void * dst, size_t dstCapacity, cons
                 throw SawyerChunkException(EXCEPTION_MSG_DESTINATION_TOO_SMALL);
             }
 
-            std::copy_n(copySrc, count, dst8);
+            std::memcpy(dst8, copySrc, count);
             dst8 += count;
         }
     }
@@ -250,3 +244,53 @@ size_t SawyerChunkReader::DecodeChunkRotate(void * dst, size_t dstCapacity, cons
     }
     return srcLength;
 }
+
+// Allocates the MAX_UNCOMPRESSED_CHUNK_SIZE byte scratch buffer that chunks are decoded into.
+// Throws std::runtime_error if the allocation fails. The result must be passed to either
+// FinaliseLargeTempBuffer or FreeLargeTempBuffer, because it may come from HeapAlloc.
+void * SawyerChunkReader::AllocateLargeTempBuffer()
+{
+#ifdef __USE_HEAP_ALLOC__
+    auto buffer = HeapAlloc(GetProcessHeap(), 0, MAX_UNCOMPRESSED_CHUNK_SIZE);
+#else
+    auto buffer = std::malloc(MAX_UNCOMPRESSED_CHUNK_SIZE);
+#endif
+    if (buffer == nullptr)
+    {
+        throw std::runtime_error("Unable to allocate large temporary buffer.");
+    }
+    return buffer;
+}
+
+// Returns a std::malloc / std::realloc allocated buffer holding the first len bytes of a buffer
+// obtained from AllocateLargeTempBuffer. The caller must not use the passed buffer afterwards.
+// Throws std::runtime_error if the final buffer cannot be allocated.
+void * SawyerChunkReader::FinaliseLargeTempBuffer(void * buffer, size_t len)
+{
+#ifdef __USE_HEAP_ALLOC__
+    auto finalBuffer = std::malloc(len);
+    if (finalBuffer != nullptr)
+    {
+        // Copy only once the allocation is known to have succeeded; a failure is reported below.
+        std::memcpy(finalBuffer, buffer, len);
+    }
+    HeapFree(GetProcessHeap(), 0, buffer);
+#else
+    auto finalBuffer = (uint8 *)std::realloc(buffer, len);
+#endif
+    if (finalBuffer == nullptr)
+    {
+        throw std::runtime_error("Unable to allocate final buffer.");
+    }
+    return finalBuffer;
+}
+
+// Releases a buffer obtained from AllocateLargeTempBuffer that has not been finalised.
+void SawyerChunkReader::FreeLargeTempBuffer(void * buffer)
+{
+#ifdef __USE_HEAP_ALLOC__
+    HeapFree(GetProcessHeap(), 0, buffer);
+#else
+    std::free(buffer);
+#endif
+}
diff --git a/src/openrct2/rct12/SawyerChunkReader.h b/src/openrct2/rct12/SawyerChunkReader.h
index 2e5dcddaf4..5cf3180750 100644
--- a/src/openrct2/rct12/SawyerChunkReader.h
+++ b/src/openrct2/rct12/SawyerChunkReader.h
@@ -75,4 +75,8 @@ private:
     static size_t DecodeChunkRLE(void * dst, size_t dstCapacity, const void * src, size_t srcLength);
     static size_t DecodeChunkRepeat(void * dst, size_t dstCapacity, const void * src, size_t srcLength);
     static size_t DecodeChunkRotate(void * dst, size_t dstCapacity, const void * src, size_t srcLength);
+
+    static void * AllocateLargeTempBuffer();
+    static void * FinaliseLargeTempBuffer(void * buffer, size_t len);
+    static void FreeLargeTempBuffer(void * buffer);
 };