From ce00e7795a18bd936e9fdf3ac4c709ae1f390685 Mon Sep 17 00:00:00 2001
From: berryzplus <berryzplus@gmail.com>
Date: Sun, 6 Nov 2022 18:57:10 +0900
Subject: [PATCH] =?UTF-8?q?=E6=9B=B8=E3=81=8D=E8=BE=BC=E3=81=BF=E7=AF=84?=
 =?UTF-8?q?=E5=9B=B2=E3=81=AE=E3=83=81=E3=82=A7=E3=83=83=E3=82=AF=E6=BC=8F?=
 =?UTF-8?q?=E3=82=8C=E5=AF=BE=E7=AD=96(CUtf7)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 sakura_core/charset/CESI.cpp        |  3 +-
 sakura_core/charset/CUtf7.cpp       | 69 ++++++++++++++++-------------
 sakura_core/charset/CUtf7.h         |  2 +-
 sakura_core/charset/codechecker.cpp | 16 +++----
 sakura_core/charset/codechecker.h   |  4 +-
 tests/unittests/test-ccodebase.cpp  | 51 +++++++++++++++++++++
 6 files changed, 101 insertions(+), 44 deletions(-)

diff --git a/sakura_core/charset/CESI.cpp b/sakura_core/charset/CESI.cpp
index 5a3e7f3c9a..bd2037b0ee 100644
--- a/sakura_core/charset/CESI.cpp
+++ b/sakura_core/charset/CESI.cpp
@@ -361,8 +361,7 @@ void CESI::GetEncodingInfo_eucjp( const char* pS, const int nLen )
 */
 void CESI::GetEncodingInfo_utf7( const char* pS, const int nLen )
 {
-	const char *pr, *pr_end;
-	char *pr_next;
+	const char *pr, *pr_end, *pr_next;
 	int npoints, nlen_setb;
 	bool berror;
 
diff --git a/sakura_core/charset/CUtf7.cpp b/sakura_core/charset/CUtf7.cpp
index 5bbb5262b6..064092b6f4 100644
--- a/sakura_core/charset/CUtf7.cpp
+++ b/sakura_core/charset/CUtf7.cpp
@@ -26,6 +26,9 @@
 
 #include "StdAfx.h"
 #include "CUtf7.h"
+
+#include <string_view>
+
 #include "charset/charcode.h"
 #include "charset/codechecker.h"
 #include "convert/convert_util2.h"
@@ -79,31 +82,36 @@ int CUtf7::_Utf7SetBToUni_block( const char* pSrc, const int nSrcLen, wchar_t* p
 
 int CUtf7::Utf7ToUni( const char* pSrc, const int nSrcLen, wchar_t* pDst, bool* pbError )
 {
-	const char *pr, *pr_end;
-	char *pr_next;
-	wchar_t *pw;
-	int nblocklen=0;
-	bool berror_tmp, berror=false;
+	bool berror = false;
 
-	pr = pSrc;
-	pr_end = pSrc + nSrcLen;
-	pw = pDst;
+	std::string_view src(pSrc, nSrcLen);
+
+	auto pr      = src.cbegin();
+	auto pr_end  = src.cend();
+	auto pr_next = src.begin();
+
+	auto* pw = pDst;
+
+	while( pr_next < pr_end && *pr )
+	{
+		bool berror_tmp = false;
 
-	do{
 		// UTF-7 Set D 部分のチェック
-		nblocklen = CheckUtf7DPart( pr, pr_end-pr, &pr_next, &berror_tmp );
-		if( berror_tmp == true ){
+		auto pr_next2 = &*pr_next;
+		int nblocklen = CheckUtf7DPart( &*pr, pr_end - pr, &pr_next2, &berror_tmp );
+		if( berror_tmp ){
 			berror = true;
 		}
-		pw += _Utf7SetDToUni_block( pr, nblocklen, pw );
-
-		pr = pr_next;  // 次の読み込み位置を取得
-		if( pr_next >= pr_end ){
+		pr_next += pr_next2 - &*pr_next;
+		pw += _Utf7SetDToUni_block( &*pr, nblocklen, pw );
+		if( pr_end <= pr_next || !*pr_next ){
 			break;
 		}
+		pr = pr_next;  // 次の読み込み位置を取得
 
 		// UTF-7 Set B 部分のチェック
-		nblocklen = CheckUtf7BPart( pr, pr_end-pr, &pr_next, &berror_tmp, UC_LOOSE );
+		nblocklen = CheckUtf7BPart( &*pr, pr_end - pr, &pr_next2, &berror_tmp, UC_LOOSE );
+		pr_next += pr_next2 - &*pr_next;
 		{
 			// エラーがあってもできるところまでデコード
 			if( berror_tmp ){
@@ -114,14 +122,14 @@ int CUtf7::Utf7ToUni( const char* pSrc, const int nSrcLen, wchar_t* pDst, bool*
 				*pw = L'+';
 				++pw;
 			}else{
-				pw += _Utf7SetBToUni_block( pr, nblocklen, pw, &berror_tmp );
-				if( berror_tmp != false ){
+				pw += _Utf7SetBToUni_block( &*pr, nblocklen, pw, &berror_tmp );
+				if( berror_tmp ){
 					berror = true;
 				}
 			}
 		}
 		pr = pr_next;  // 次の読み込み位置を取得
-	}while( pr_next < pr_end );
+	}
 
 	if( pbError ){
 		*pbError = berror;
@@ -135,7 +143,7 @@ int CUtf7::Utf7ToUni( const char* pSrc, const int nSrcLen, wchar_t* pDst, bool*
 EConvertResult CUtf7::UTF7ToUnicode( const CMemory& cSrc, CNativeW* pDstMem )
 {
 	// エラー状態：
-	bool bError;
+	bool bError = false;
 
 	// データ取得
 	int nDataLen = cSrc.GetRawLength();
@@ -207,7 +215,7 @@ int CUtf7::_UniToUtf7SetB_block( const wchar_t* pSrc, const int nSrcLen, char* p
 	return pw - pDst;
 }
 
-int CUtf7::UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst )
+int CUtf7::UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst, int nDstLen )
 {
 	const wchar_t *pr, *pr_base;
 	const wchar_t* pr_end;
@@ -229,10 +237,12 @@ int CUtf7::UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst )
 
 		if( *pr == L'+' ){
 			// '+' → "+-"
-			pw[0] = '+';
-			pw[1] = '-';
+			if( nDstLen < pw + 2 - pDst ){
+				break;
+			}
+			*(pw++) = '+';
+			*(pw++) = '-';
 			++pr;
-			pw += 2;
 		}else{
 			for( ; pr < pr_end; ++pr ){
 				if( IsUtf7SetD(*pr) ){
@@ -258,18 +268,15 @@ EConvertResult CUtf7::UnicodeToUTF7( const CNativeW& cSrc, CMemory* pDstMem )
 
 	// 出力先バッファの確保
 	// 最大で、変換元のデータ長の５倍。
-	char *pDst = new (std::nothrow) char[ nSrcLen * 5 + 1 ];  // * → +ACo-
-	if( pDst == NULL ){
-		return RESULT_FAILURE;
-	}
+	int nDstLen = nSrcLen * 5;
+	std::string dst( nDstLen, char() );  // * → +ACo-
+	auto pDst = dst.data();
 
 	// 変換
-	int nDstLen = UniToUtf7( pSrc, nSrcLen, pDst );
+	nDstLen = UniToUtf7( pSrc, nSrcLen, pDst, nDstLen );
 
 	// pMem にデータをセット
 	pDstMem->SetRawDataHoldBuffer( pDst, nDstLen );
 
-	delete [] pDst;
-
 	return RESULT_COMPLETE;
 }
diff --git a/sakura_core/charset/CUtf7.h b/sakura_core/charset/CUtf7.h
index b4708f7f95..697cbe161b 100644
--- a/sakura_core/charset/CUtf7.h
+++ b/sakura_core/charset/CUtf7.h
@@ -50,6 +50,6 @@ class CUtf7 : public CCodeBase{
 
 	static int _UniToUtf7SetD_block( const wchar_t* pSrc, const int nSrcLen, char* pDst );
 	static int _UniToUtf7SetB_block( const wchar_t* pSrc, const int nSrcLen, char* pDst );
-	static int UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst );
+	static int UniToUtf7( const wchar_t* pSrc, const int nSrcLen, char* pDst, int nDstLen );
 };
 #endif /* SAKURA_CUTF7_55498766_1C8A_416B_9F39_88D3D83B8B65_H_ */
diff --git a/sakura_core/charset/codechecker.cpp b/sakura_core/charset/codechecker.cpp
index 49709f0c5d..a62d250617 100644
--- a/sakura_core/charset/codechecker.cpp
+++ b/sakura_core/charset/codechecker.cpp
@@ -888,7 +888,7 @@ EndFunc:;
 	戻り値と ppNextChar に格納されるポインタは使えない。
 	1つ以上のエラーが見つかれば候補から外れるのでそういう適当な仕様に。
 */
-int CheckUtf7DPart( const char *pS, const int nLen, char **ppNextChar, bool *pbError )
+int CheckUtf7DPart( const char *pS, size_t nLen, const char **ppNextChar, bool *pbError )
 {
 	const char *pr, *pr_end;
 	bool berror = false;
@@ -921,11 +921,11 @@ int CheckUtf7DPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE
 
 	if( pr < pr_end ){
 		// '+' をスキップ
-		*ppNextChar = const_cast<char*>(pr) + 1;
+		*ppNextChar = pr + 1;
 	}else{
-		*ppNextChar = const_cast<char*>(pr);
+		*ppNextChar = pr;
 	}
-	return pr - pS;
+	return static_cast<int>( pr - pS );
 }
 
 /*!
@@ -937,7 +937,7 @@ int CheckUtf7DPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE
 
 	@note この関数の前に CheckUtf7DPart() が実行される必要がある。
 */
-int CheckUtf7BPart( const char *pS, const int nLen, char **ppNextChar, bool *pbError, const int nOption, bool* pbNoAddPoint )
+int CheckUtf7BPart( const char *pS, size_t nLen, const char **ppNextChar, bool *pbError, const int nOption, bool* pbNoAddPoint )
 {
 	const char *pr, *pr_end;
 	bool berror_found, bminus_found;
@@ -969,7 +969,7 @@ int CheckUtf7BPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE
 		// セットＢの文字でなくなるまでループ
 		if( !IsBase64(*pr) ){
 			if( *pr == '-' ){
-				bminus_found= true;
+				bminus_found = true;
 			}else{
 				bminus_found = false;
 			}
@@ -977,7 +977,7 @@ int CheckUtf7BPart( const char *pS, const int nLen, char **ppNextChar, bool *pbE
 		}
 	}
 
-	nchecklen = pr - pS;
+	nchecklen = static_cast<int>( pr - pS );
 
 	// 保護コード
 	if( nchecklen < 1 ){
@@ -1065,7 +1065,7 @@ EndFunc:;
 
 	if( (berror_found == false || UC_LOOSE == (nOption & UC_LOOSE)) && (pr < pr_end && bminus_found == true) ){
 		// '-' をスキップ。
-		*ppNextChar = const_cast<char*>(pr) + 1;
+		*ppNextChar = pr + 1;
 	}else{
 		*ppNextChar = const_cast<char*>(pr);
 
diff --git a/sakura_core/charset/codechecker.h b/sakura_core/charset/codechecker.h
index c0d7fe0d2e..6d4b48a90b 100644
--- a/sakura_core/charset/codechecker.h
+++ b/sakura_core/charset/codechecker.h
@@ -447,6 +447,6 @@ int CheckUtf8Char( const char*, const int, ECharSet*, const bool bAllow4byteCode
 int CheckUtf8Char2( const char*, const int, ECharSet*, const bool bAllow4byteCode, const int nOption );
 int CheckCesu8Char( const char*, const int, ECharSet*, const int nOption );
 // UTF-7 フォーマットチェック
-int CheckUtf7DPart( const char*, const int, char **ppNextChar, bool *pbError );
-int CheckUtf7BPart( const char*, const int, char **ppNextChar, bool *pbError, const int nOption, bool *pbNoAddPoint = NULL );
+int CheckUtf7DPart( const char* pS, size_t nLen, const char **ppNextChar, bool *pbError );
+int CheckUtf7BPart( const char* pS, size_t nLen, const char **ppNextChar, bool *pbError, const int nOption, bool *pbNoAddPoint = NULL );
 #endif /* SAKURA_CODECHECKER_62A18A31_2ECD_47B6_AEE1_38EDDAD3FF2B_H_ */
diff --git a/tests/unittests/test-ccodebase.cpp b/tests/unittests/test-ccodebase.cpp
index 13bde9d1ad..0e62b155c2 100644
--- a/tests/unittests/test-ccodebase.cpp
+++ b/tests/unittests/test-ccodebase.cpp
@@ -347,6 +347,57 @@ TEST(CCodeBase, codeUtf8_OracleImplementation)
 	ASSERT_TRUE( bComplete2_2 );
 }
 
+/*!
+ * @brief Character-encoding conversion test: UTF-7 bytes -> Unicode -> UTF-7 bytes round trip for three sample strings.
+ */
+TEST(CCodeBase, codeUtf7)
+{
+	const auto eCodeType = CODE_UTF7;
+	auto pCodeBase = CCodeFactory::CreateCodeBase(eCodeType);
+
+	// Case 1: the 7-bit ASCII range U+0001..U+007F (per the UTF-7 specification)
+	constexpr const auto& mbsAscii = "+AAEAAgADAAQABQAGAAcACA-\t\n+AAsADA-\r+AA4ADwAQABEAEgATABQAFQAWABcAGAAZABoAGwAcAB0AHgAf- +ACEAIgAjACQAJQAm-'()+ACoAKw-,-./0123456789:+ADsAPAA9AD4-?+AEA-ABCDEFGHIJKLMNOPQRSTUVWXYZ+AFsAXABdAF4AXwBg-abcdefghijklmnopqrstuvwxyz+AHsAfAB9AH4Afw-";
+	constexpr const auto& wcsAscii = L"\x01\x02\x03\x04\x05\x06\a\b\t\n\v\f\r\x0E\x0F\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1A\x1B\x1C\x1D\x1E\x1F !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7F";
+
+	bool bComplete1_1 = false;  // receives the completion flag reported by CodeToUnicode
+	auto encoded1 = pCodeBase->CodeToUnicode(BinarySequenceView(reinterpret_cast<const std::byte*>(mbsAscii), _countof(mbsAscii)), &bComplete1_1);  // NOTE(review): _countof() of the literal presumably counts its terminating NUL too - confirm this is intended
+	EXPECT_STREQ(wcsAscii, encoded1.GetStringPtr());  // despite its name, encoded1 holds the Unicode text produced from the UTF-7 bytes
+	EXPECT_TRUE(bComplete1_1);
+
+	bool bComplete1_2 = false;  // receives the completion flag reported by UnicodeToCode
+	auto decoded1 = pCodeBase->UnicodeToCode(encoded1, &bComplete1_2);  // convert the Unicode text back to UTF-7 bytes
+	EXPECT_EQ(0, memcmp(mbsAscii, decoded1.data(), decoded1.size()));  // NOTE(review): compares only decoded1.size() bytes, so a shorter (or empty) result would still compare equal
+	EXPECT_TRUE(bComplete1_2);
+
+	// Case 2: half-width kana, hiragana, katakana and kanji (per the UTF-7 specification)
+	constexpr const auto& wcsKanaKanji = L"ｶﾅかなカナ漢字";
+	constexpr const auto& mbsKanaKanji = "+/3b/hTBLMGowqzDKbyJbVw-";
+
+	bool bComplete2_1 = false;
+	auto encoded2 = pCodeBase->CodeToUnicode(BinarySequenceView(reinterpret_cast<const std::byte*>(mbsKanaKanji), _countof(mbsKanaKanji)), &bComplete2_1);  // UTF-7 bytes -> Unicode
+	ASSERT_STREQ(wcsKanaKanji, encoded2.GetStringPtr());
+	ASSERT_TRUE(bComplete2_1);
+
+	bool bComplete2_2 = false;
+	auto decoded2 = pCodeBase->UnicodeToCode(encoded2, &bComplete2_2);  // Unicode -> UTF-7 bytes
+	ASSERT_EQ(0, memcmp(mbsKanaKanji, decoded2.data(), decoded2.size()));  // same size-bounded comparison as in case 1
+	ASSERT_TRUE(bComplete2_2);
+
+	// Case 3: a literal '+' is written as "+-" (per the UTF-7 specification)
+	constexpr const auto& wcsPlusPlus = L"C++";
+	constexpr const auto& mbsPlusPlus = "C+-+-";
+
+	bool bComplete5_1 = false;
+	auto encoded5 = pCodeBase->CodeToUnicode(BinarySequenceView(reinterpret_cast<const std::byte*>(mbsPlusPlus), _countof(mbsPlusPlus)), &bComplete5_1);  // UTF-7 bytes -> Unicode
+	ASSERT_STREQ(wcsPlusPlus, encoded5.GetStringPtr());
+	ASSERT_TRUE(bComplete5_1);
+
+	bool bComplete5_2 = false;
+	auto decoded5 = pCodeBase->UnicodeToCode(encoded5, &bComplete5_2);  // Unicode -> UTF-7 bytes
+	ASSERT_EQ(0, memcmp(mbsPlusPlus, decoded5.data(), decoded5.size()));  // same size-bounded comparison as in case 1
+	ASSERT_TRUE(bComplete5_2);
+}
+
 /*!
  * @brief 文字コード変換のテスト
  */