From ce00e7795a18bd936e9fdf3ac4c709ae1f390685 Mon Sep 17 00:00:00 2001 From: berryzplus Date: Sun, 6 Nov 2022 18:57:10 +0900 Subject: [PATCH] =?UTF-8?q?=E6=9B=B8=E3=81=8D=E8=BE=BC=E3=81=BF=E7=AF=84?= =?UTF-8?q?=E5=9B=B2=E3=81=AE=E3=83=81=E3=82=A7=E3=83=83=E3=82=AF=E6=BC=8F?= =?UTF-8?q?=E3=82=8C=E5=AF=BE=E7=AD=96(CUtf7)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- sakura_core/charset/CESI.cpp | 3 +- sakura_core/charset/CUtf7.cpp | 69 ++++++++++++++++------------- sakura_core/charset/CUtf7.h | 2 +- sakura_core/charset/codechecker.cpp | 16 +++---- sakura_core/charset/codechecker.h | 4 +- tests/unittests/test-ccodebase.cpp | 51 +++++++++++++++++++++ 6 files changed, 101 insertions(+), 44 deletions(-) diff --git a/sakura_core/charset/CESI.cpp b/sakura_core/charset/CESI.cpp index 5a3e7f3c9a..bd2037b0ee 100644 --- a/sakura_core/charset/CESI.cpp +++ b/sakura_core/charset/CESI.cpp @@ -361,8 +361,7 @@ void CESI::GetEncodingInfo_eucjp( const char* pS, const int nLen ) */ void CESI::GetEncodingInfo_utf7( const char* pS, const int nLen ) { - const char *pr, *pr_end; - char *pr_next; + const char *pr, *pr_end, *pr_next; int npoints, nlen_setb; bool berror; diff --git a/sakura_core/charset/CUtf7.cpp b/sakura_core/charset/CUtf7.cpp index 5bbb5262b6..064092b6f4 100644 --- a/sakura_core/charset/CUtf7.cpp +++ b/sakura_core/charset/CUtf7.cpp @@ -26,6 +26,9 @@ #include "StdAfx.h" #include "CUtf7.h" + +#include + #include "charset/charcode.h" #include "charset/codechecker.h" #include "convert/convert_util2.h" @@ -79,31 +82,36 @@ int CUtf7::_Utf7SetBToUni_block( const char* pSrc, const int nSrcLen, wchar_t* p int CUtf7::Utf7ToUni( const char* pSrc, const int nSrcLen, wchar_t* pDst, bool* pbError ) { - const char *pr, *pr_end; - char *pr_next; - wchar_t *pw; - int nblocklen=0; - bool berror_tmp, berror=false; + bool berror = false; - pr = pSrc; - pr_end = pSrc + nSrcLen; - pw = pDst; + std::string_view src(pSrc, nSrcLen); + + auto pr = src.cbegin(); + auto pr_end = src.cend(); + auto pr_next = src.begin(); + + auto* pw = pDst; + + while( pr_next < pr_end && *pr ) + { + bool berror_tmp = false; - do{ // UTF-7 Set D 部分のチェック - nblocklen = CheckUtf7DPart( pr, pr_end-pr, &pr_next, &berror_tmp ); - if( berror_tmp == true ){ + auto pr_next2 = &*pr_next; + int nblocklen = CheckUtf7DPart( &*pr, pr_end - pr, &pr_next2, &berror_tmp ); + if( berror_tmp ){ berror = true; } - pw += _Utf7SetDToUni_block( pr, nblocklen, pw ); - - pr = pr_next; // 次の読み込み位置を取得 - if( pr_next >= pr_end ){ + pr_next += pr_next2 - &*pr_next; + pw += _Utf7SetDToUni_block( &*pr, nblocklen, pw ); + if( pr_end <= pr_next || !*pr_next ){ break; } + pr = pr_next; // 次の読み込み位置を取得 // UTF-7 Set B 部分のチェック - nblocklen = CheckUtf7BPart( pr, pr_end-pr, &pr_next, &berror_tmp, UC_LOOSE ); + nblocklen = CheckUtf7BPart( &*pr, pr_end - pr, &pr_next2, &berror_tmp, UC_LOOSE ); + pr_next += pr_next2 - &*pr_next; { // エラーがあってもできるところまでデコード if( berror_tmp ){ @@ -114,14 +122,14 @@ int CUtf7::Utf7ToUni( const char* pSrc, const int nSrcLen, wchar_t* pDst, bool* *pw = L'+'; ++pw; }else{ - pw += _Utf7SetBToUni_block( pr, nblocklen, pw, &berror_tmp ); - if( berror_tmp != false ){ + pw += _Utf7SetBToUni_block( &*pr, nblocklen, pw, &berror_tmp ); + if( berror_tmp ){ berror = true; } } } pr = pr_next; // 次の読み込み位置を取得 - }while( pr_next < pr_end ); + } if( pbError ){ *pbError = berror; @@ -135,7 +143,7 @@ int CUtf7::Utf7ToUni( const char* pSrc, const int nSrcLen, wchar_t* pDst, bool* EConvertResult CUtf7::UTF7ToUnicode( const CMemory& cSrc, CNativeW* pDstMem ) { // エラー状態: - bool bError; + bool bError = false; // データ取得 int nDataLen = cSrc.GetRawLength(); @@ -207,7 +215,7 @@ int CUtf7::_UniToUtf7SetB_block( const wchar_t* pSrc, const int nSrcLen, char* p return pw - pDst; } -int CUtf7::UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst ) +int CUtf7::UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst, int nDstLen ) { const wchar_t *pr, *pr_base; const wchar_t* pr_end; @@ -229,10 +237,12 @@ int CUtf7::UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst ) if( *pr == L'+' ){ // '+' → "+-" - pw[0] = '+'; - pw[1] = '-'; + if( nDstLen < pw + 2 - pDst ){ + break; + } + *(pw++) = '+'; + *(pw++) = '-'; ++pr; - pw += 2; }else{ for( ; pr < pr_end; ++pr ){ if( IsUtf7SetD(*pr) ){ @@ -258,18 +268,15 @@ EConvertResult CUtf7::UnicodeToUTF7( const CNativeW& cSrc, CMemory* pDstMem ) // 出力先バッファの確保 // 最大で、変換元のデータ長の5倍。 - char *pDst = new (std::nothrow) char[ nSrcLen * 5 + 1 ]; // * → +ACo- - if( pDst == NULL ){ - return RESULT_FAILURE; - } + int nDstLen = nSrcLen * 5; + std::string dst( nDstLen, char() ); // * → +ACo- + auto pDst = dst.data(); // 変換 - int nDstLen = UniToUtf7( pSrc, nSrcLen, pDst ); + nDstLen = UniToUtf7( pSrc, nSrcLen, pDst, nDstLen ); // pMem にデータをセット pDstMem->SetRawDataHoldBuffer( pDst, nDstLen ); - delete [] pDst; - return RESULT_COMPLETE; } diff --git a/sakura_core/charset/CUtf7.h b/sakura_core/charset/CUtf7.h index b4708f7f95..697cbe161b 100644 --- a/sakura_core/charset/CUtf7.h +++ b/sakura_core/charset/CUtf7.h @@ -50,6 +50,6 @@ class CUtf7 : public CCodeBase{ static int _UniToUtf7SetD_block( const wchar_t* pSrc, const int nSrcLen, char* pDst ); static int _UniToUtf7SetB_block( const wchar_t* pSrc, const int nSrcLen, char* pDst ); - static int UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst ); + static int UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst, int nDstLen ); }; #endif /* SAKURA_CUTF7_55498766_1C8A_416B_9F39_88D3D83B8B65_H_ */ diff --git a/sakura_core/charset/codechecker.cpp b/sakura_core/charset/codechecker.cpp index 49709f0c5d..a62d250617 100644 --- a/sakura_core/charset/codechecker.cpp +++ b/sakura_core/charset/codechecker.cpp @@ -888,7 +888,7 @@ EndFunc:; 戻り値と ppNextChar に格納されるポインタは使えない。 1つ以上のエラーが見つかれば候補から外れるのでそういう適当な仕様に。 */ -int CheckUtf7DPart( const char *pS, const int nLen, char **ppNextChar, bool *pbError ) +int CheckUtf7DPart( const char *pS, size_t nLen, const char **ppNextChar, bool *pbError ) { const char *pr, *pr_end; bool berror = false; @@ -921,11 +921,11 @@ int CheckUtf7DPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE if( pr < pr_end ){ // '+' をスキップ - *ppNextChar = const_cast(pr) + 1; + *ppNextChar = pr + 1; }else{ - *ppNextChar = const_cast(pr); + *ppNextChar = pr; } - return pr - pS; + return static_cast( pr - pS ); } /*! @@ -937,7 +937,7 @@ int CheckUtf7DPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE @note この関数の前に CheckUtf7DPart() が実行される必要がある。 */ -int CheckUtf7BPart( const char *pS, const int nLen, char **ppNextChar, bool *pbError, const int nOption, bool* pbNoAddPoint ) +int CheckUtf7BPart( const char *pS, size_t nLen, const char **ppNextChar, bool *pbError, const int nOption, bool* pbNoAddPoint ) { const char *pr, *pr_end; bool berror_found, bminus_found; @@ -969,7 +969,7 @@ int CheckUtf7BPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE // セットBの文字でなくなるまでループ if( !IsBase64(*pr) ){ if( *pr == '-' ){ - bminus_found= true; + bminus_found = true; }else{ bminus_found = false; } @@ -977,7 +977,7 @@ int CheckUtf7BPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE } } - nchecklen = pr - pS; + nchecklen = static_cast( pr - pS ); // 保護コード if( nchecklen < 1 ){ @@ -1065,7 +1065,7 @@ EndFunc:; if( (berror_found == false || UC_LOOSE == (nOption & UC_LOOSE)) && (pr < pr_end && bminus_found == true) ){ // '-' をスキップ。 - *ppNextChar = const_cast(pr) + 1; + *ppNextChar = pr + 1; }else{ *ppNextChar = const_cast(pr); diff --git a/sakura_core/charset/codechecker.h b/sakura_core/charset/codechecker.h index c0d7fe0d2e..6d4b48a90b 100644 --- a/sakura_core/charset/codechecker.h +++ b/sakura_core/charset/codechecker.h @@ -447,6 +447,6 @@ int CheckUtf8Char( const char*, const int, ECharSet*, const bool bAllow4byteCode int CheckUtf8Char2( const char*, const int, ECharSet*, const bool bAllow4byteCode, const int nOption ); int CheckCesu8Char( const char*, const int, ECharSet*, const int nOption ); // UTF-7 フォーマットチェック -int CheckUtf7DPart( const char*, const int, char **ppNextChar, bool *pbError ); -int CheckUtf7BPart( const char*, const int, char **ppNextChar, bool *pbError, const int nOption, bool *pbNoAddPoint = NULL ); +int CheckUtf7DPart( const char* pS, size_t nLen, const char **ppNextChar, bool *pbError ); +int CheckUtf7BPart( const char* pS, size_t nLen, const char **ppNextChar, bool *pbError, const int nOption, bool *pbNoAddPoint = NULL ); #endif /* SAKURA_CODECHECKER_62A18A31_2ECD_47B6_AEE1_38EDDAD3FF2B_H_ */ diff --git a/tests/unittests/test-ccodebase.cpp b/tests/unittests/test-ccodebase.cpp index 13bde9d1ad..0e62b155c2 100644 --- a/tests/unittests/test-ccodebase.cpp +++ b/tests/unittests/test-ccodebase.cpp @@ -347,6 +347,57 @@ TEST(CCodeBase, codeUtf8_OracleImplementation) ASSERT_TRUE( bComplete2_2 ); } +/*! + * @brief 文字コード変換のテスト + */ +TEST(CCodeBase, codeUtf7) +{ + const auto eCodeType = CODE_UTF7; + auto pCodeBase = CCodeFactory::CreateCodeBase(eCodeType); + + // 7bit ASCII範囲(UTF-7仕様) + constexpr const auto& mbsAscii = "+AAEAAgADAAQABQAGAAcACA-\t\n+AAsADA-\r+AA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAf- +ACEAIgAjACQAJQAm-'()+ACoAKw-,-./0123456789:+ADsAPAA9AD4-?+AEA-ABCDEFGHIJKLMNOPQRSTUVWXYZ+AFsAXABdAF4AXwBg-abcdefghijklmnopqrstuvwxyz+AHsAfAB9AH4Afw-"; + constexpr const auto& wcsAscii = L"\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F"; + + bool bComplete1_1 = false; + auto encoded1 = pCodeBase->CodeToUnicode(BinarySequenceView(reinterpret_cast(mbsAscii), _countof(mbsAscii)), &bComplete1_1); + EXPECT_STREQ(wcsAscii, encoded1.GetStringPtr()); + EXPECT_TRUE(bComplete1_1); + + bool bComplete1_2 = false; + auto decoded1 = pCodeBase->UnicodeToCode(encoded1, &bComplete1_2); + EXPECT_EQ(0, memcmp(mbsAscii, decoded1.data(), decoded1.size())); + EXPECT_TRUE(bComplete1_2); + + // かな漢字の変換(UTF-7仕様) + constexpr const auto& wcsKanaKanji = L"カナかなカナ漢字"; + constexpr const auto& mbsKanaKanji = "+/3b/hTBLMGowqzDKbyJbVw-"; + + bool bComplete2_1 = false; + auto encoded2 = pCodeBase->CodeToUnicode(BinarySequenceView(reinterpret_cast(mbsKanaKanji), _countof(mbsKanaKanji)), &bComplete2_1); + ASSERT_STREQ(wcsKanaKanji, encoded2.GetStringPtr()); + ASSERT_TRUE(bComplete2_1); + + bool bComplete2_2 = false; + auto decoded2 = pCodeBase->UnicodeToCode(encoded2, &bComplete2_2); + ASSERT_EQ(0, memcmp(mbsKanaKanji, decoded2.data(), decoded2.size())); + ASSERT_TRUE(bComplete2_2); + + // UTF-7仕様 + constexpr const auto& wcsPlusPlus = L"C++"; + constexpr const auto& mbsPlusPlus = "C+-+-"; + + bool bComplete5_1 = false; + auto encoded5 = pCodeBase->CodeToUnicode(BinarySequenceView(reinterpret_cast(mbsPlusPlus), _countof(mbsPlusPlus)), &bComplete5_1); + ASSERT_STREQ(wcsPlusPlus, encoded5.GetStringPtr()); + ASSERT_TRUE(bComplete5_1); + + bool bComplete5_2 = false; + auto decoded5 = pCodeBase->UnicodeToCode(encoded5, &bComplete5_2); + ASSERT_EQ(0, memcmp(mbsPlusPlus, decoded5.data(), decoded5.size())); + ASSERT_TRUE(bComplete5_2); +} + /*! * @brief 文字コード変換のテスト */