From d4781cc3884b3c0ea22457d14c780213a2e66d37 Mon Sep 17 00:00:00 2001 From: xiexu <1019362424@qq.com> Date: Wed, 29 Nov 2023 22:16:18 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BF=AE=E6=94=B9def=20extract=5Fzip(file,?= =?UTF-8?q?=20password,=20extract=5Ffull=5Fpath)=E8=A7=A3=E5=86=B3?= =?UTF-8?q?=E6=96=87=E4=BB=B6=E5=A4=B9=E4=B9=B1=E7=A0=81=E4=BB=A5=E5=8F=8A?= =?UTF-8?q?=E5=A4=9A=E5=B1=82=E5=8E=8B=E7=BC=A9=E6=97=A0=E6=B3=95=E8=A7=A3?= =?UTF-8?q?=E5=8E=8B=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .DS_Store | Bin 0 -> 6148 bytes .vscode/launch.json | 15 ++ corpus_processing/extract.py | 85 +++++++---- tobereomve.txt | 289 +++++++++++++++++++++++++++++++++++ 4 files changed, 362 insertions(+), 27 deletions(-) create mode 100644 .DS_Store create mode 100644 .vscode/launch.json create mode 100644 tobereomve.txt diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a05ccc41b9247b8d913170dc98c043801996e803 GIT binary patch literal 6148 zcmeHK(Q4Z;6unB*#v!a^u(3S~`Wk53g|V?0leRxFkL{r=t{t1s%(H^+)DI!x*Y1b* zC;OhBE2$%0y4TPZU2}Eht9vAL9Hd+#64OO;L^LEK56;->qj|u1oPEPOHnIm4dXAJz zs^g}b$N5IK3p_>z_}lepMv4l$rnT>4-P+k48P|;3Uyg)`D&wr+`!7zfpkC2MK5NEw%>r z(Sb&n0KhK7+R*1(f;rNnZ?QFq5tuMkprHy|VhBS=T-rR}Vr$UQN!a2;*p-E?P=sC` z^Glsh!Z+wzr+`!7z5+Yub;$Ss*}v=m`y}_~6mSYWDFsCLG&r4NOZINv*c{)r5`GD1 p3}#joMo5SQEm`W9P*h``(*0WE`ToB~@_;1)JJZQK9= literal 0 HcmV?d00001 diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..890202a --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,15 @@ +{ + "version": "0.2.0", + "configurations": [ + + { + "name": "Python: Debug convert.py", + "type": "python", + "request": "launch", + "program": "${workspaceFolder}/corpus_processing/extract.py", + "args": ["--folder_path","/Volumes/Elements/AITest/test/1256/", "--passwords_files","/Volumes/Elements/AITest/test/README_1.txt"], + // "args": ["--src_dir", "/Volumes/Elements/AITest/testlog.zip", "--dst_dir", "/Volumes/Elements/AITest/AITest", "--n_process", "4", "--threshold", "0.7"], + "console": "integratedTerminal" + } + ] +} \ No newline at end of file diff --git a/corpus_processing/extract.py b/corpus_processing/extract.py index 457b143..9fa42fa 100644 --- a/corpus_processing/extract.py +++ b/corpus_processing/extract.py @@ -23,6 +23,18 @@ def get_extension(file_path): filename = filename_1 return filename, ''.join(extensions) +def test_encode(file_path : bytes): + try: + bytes.decode(file_path, encoding='utf-8') + return 'utf-8' + except: + pass + try: + bytes.decode(file_path, encoding='gb18030') + return 'gb18030' + except: + pass + return None def check_long_name(extract_full_path, zip_file_name):# longname返回true paths = zip_file_name.split('/') @@ -47,34 +59,53 @@ def check_long_name(extract_full_path, zip_file_name):# longname返回true def extract_zip(file, password, extract_full_path): - - with fixcharset_zipfile.ZipFile(file, 'r') as zip: - zip.setpassword(password) - - auto_filelists = [] - - for file in zip.namelist(): - problem = False - if file.endswith('/'): - continue - - new_file_path, if_long_name = check_long_name(extract_full_path, file) - if if_long_name: - problem = True - - if problem: + try: + with zipfile.ZipFile(file, 'r') as zip: + zip.setpassword(password) + for file_info in zip.infolist(): + file = file_info.filename + if file.endswith('/') or file.startswith('__MACOSX') : + continue + + try: + file_bytes = file.encode('cp437') + except: + file_bytes = file.encode('utf-8') + + coding_name = test_encode(file_bytes) + + if coding_name is None: + coding_name = api.from_data(file_bytes, mode=2) + + utf8_name = api.convert_encoding( + source_data=file_bytes, + source_encoding=coding_name, + target_encoding="utf-8", + ) + new_string = utf8_name.split('/', 1)[1] + new_file_path ,_ = check_long_name(extract_full_path, new_string) basename = os.path.dirname(new_file_path) - os.makedirs(basename, exist_ok=True) - with zip.open(file, 'r') as f_in: - data = f_in.read() - with open(new_file_path, 'wb') as f_out: - f_out.write(data) - else: - auto_filelists.append(file) - - zip.extractall(extract_full_path, auto_filelists) - + os.makedirs(basename, exist_ok=True) + # 复制文件 + source = zip.open(file_info) + with open(new_file_path, "wb") as target: + shutil.copyfileobj(source, target) + print("解压缩完成") + extract_succcessful = True + except zipfile.BadZipFile: + print("错误:无效的ZIP文件。") + extract_succcessful = False + except FileNotFoundError: + print("错误:文件未找到。") + extract_succcessful = False + except IOError as e: + print(f"输入/输出错误:{e}") + extract_succcessful = False + except Exception as e: + print(f"发生了一个未知错误:{e}") + extract_succcessful = False + return extract_succcessful @@ -139,7 +170,7 @@ def extract_archive(file_path, extract_full_path, file, password=None): with open(os.path.join(extract_full_path, filename), 'wb') as f_out: shutil.copyfileobj(f_in, f_out) elif extension in ('.zip', '.exe'): - extract_zip(file_path, password, extract_full_path) + extract_succcessful = extract_zip(file_path, password, extract_full_path) elif extension == '.7z': with py7zr.SevenZipFile(file_path, mode='r', password=password) as seven_zip: diff --git a/tobereomve.txt b/tobereomve.txt new file mode 100644 index 0000000..bbd1986 --- /dev/null +++ b/tobereomve.txt @@ -0,0 +1,289 @@ +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123/123/454.zip +/Volumes/Elements/AITest/test/11/123/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123/123/454.zip +/Volumes/Elements/AITest/test/11/123/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_2/123/454.zip +/Volumes/Elements/AITest/test/11/123_2/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_4/454.zip +/Volumes/Elements/AITest/test/11/123_4/454的副本.exe +/Volumes/Elements/AITest/test/11/123_4/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_2/123/454.zip +/Volumes/Elements/AITest/test/11/123_2/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_2/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_2/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_2/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_5/454.zip +/Volumes/Elements/AITest/test/11/123_5/454的副本.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_6/454.zip +/Volumes/Elements/AITest/test/11/123_6/454的副本.exe +/Volumes/Elements/AITest/test/11/123_6/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_6/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_6/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_2/123/454.zip +/Volumes/Elements/AITest/test/11/123_2/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_2/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_2/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_2/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/454.zip +/Volumes/Elements/AITest/test/11/123_4/454的副本.exe +/Volumes/Elements/AITest/test/11/123_4/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_5/454.zip +/Volumes/Elements/AITest/test/11/123_5/454的副本.exe +/Volumes/Elements/AITest/test/11/123_5/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_5/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_5/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123-1/454.zip +/Volumes/Elements/AITest/test/11/123-1/454的副本.exe +/Volumes/Elements/AITest/test/11/123-1/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123-1/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123-1/._454.zip +/Volumes/Elements/AITest/test/11/123-1/._454的副本.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_7/123/454.zip +/Volumes/Elements/AITest/test/11/123_7/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_2/123/454.zip +/Volumes/Elements/AITest/test/11/123_2/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_4/454.zip +/Volumes/Elements/AITest/test/11/123_4/454的副本.exe +/Volumes/Elements/AITest/test/11/123_5/454.zip +/Volumes/Elements/AITest/test/11/123_5/454的副本.exe +/Volumes/Elements/AITest/test/11/123-1/454.zip +/Volumes/Elements/AITest/test/11/123-1/454的副本.exe +/Volumes/Elements/AITest/test/11/123_6/454.zip +/Volumes/Elements/AITest/test/11/123_6/454的副本.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_8/123/454.zip +/Volumes/Elements/AITest/test/11/123_8/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_2/123/454.zip +/Volumes/Elements/AITest/test/11/123_2/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123_4/454.zip +/Volumes/Elements/AITest/test/11/123_4/454的副本.exe +/Volumes/Elements/AITest/test/11/123_5/454.zip +/Volumes/Elements/AITest/test/11/123_5/454的副本.exe +/Volumes/Elements/AITest/test/11/123-1/454.zip +/Volumes/Elements/AITest/test/11/123-1/454的副本.exe +/Volumes/Elements/AITest/test/11/123_6/454.zip +/Volumes/Elements/AITest/test/11/123_6/454的副本.exe +/Volumes/Elements/AITest/test/11/123_7/123/454.zip +/Volumes/Elements/AITest/test/11/123_7/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_1/123/454.zip +/Volumes/Elements/AITest/test/11/123_1/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123/123/454.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_1/454.zip +/Volumes/Elements/AITest/test/11/123_1/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe +/Volumes/Elements/AITest/test/11/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/454.zip +/Volumes/Elements/AITest/test/11/123_1/454的副本.exe +/Volumes/Elements/AITest/test/11/123_1/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_1/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123_4/454的副本.exe +/Volumes/Elements/AITest/test/11/123_4/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_6/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454_3/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_1/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_2/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_7/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454_4/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_2/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/454_3/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_1/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_3/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_2/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_6/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_8/454.zip +/Volumes/Elements/AITest/test/11/123_8/454的副本.exe +/Volumes/Elements/AITest/test/11/123_8/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_8/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_8/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/454_3/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_1/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/454_4/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_2/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_2/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_3/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_6/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_7/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_10/454.zip +/Volumes/Elements/AITest/test/11/123_10/454的副本.exe +/Volumes/Elements/AITest/test/11/123_10/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_10/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/._123.zip +/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_3/123/454_3/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_1/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/123/454_4/Volumes/Elements/AITest/test/11/123_3/123/454.zip +/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼_2/Volumes/Elements/AITest/test/11/123_3/123/454τÜäσ뻵£¼.exe +/Volumes/Elements/AITest/test/11/123_3/__MACOSX/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_2/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_4/123/454_3/Volumes/Elements/AITest/test/11/123_4/123/454.zip +/Volumes/Elements/AITest/test/11/123_6/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_7/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123_8/454.zip +/Volumes/Elements/AITest/test/11/123_8/454的副本.exe +/Volumes/Elements/AITest/test/11/123_8/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_8/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_8/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_9/454.zip +/Volumes/Elements/AITest/test/11/123_9/454的副本.exe +/Volumes/Elements/AITest/test/11/123_9/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123_9/123/._cli-arm64.exe +/Volumes/Elements/AITest/test/11/123-1/454.zip +/Volumes/Elements/AITest/test/11/123-1/454的副本.exe +/Volumes/Elements/AITest/test/11/123-1/cli-arm64.exe +/Volumes/Elements/AITest/test/11/123-1/._cli-arm64.exe +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/._20221224.zip +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/._20221224.zip +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/._20221224.zip +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/._20221224.zip +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/._20221224.zip +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/20221224.zip +/Volumes/Elements/AITest/test/1256/._20221224.zip From 908c0b9fb089f2d4b4c602a94be522960795ae8b Mon Sep 17 00:00:00 2001 From: xiexu <1019362424@qq.com> Date: Mon, 25 Dec 2023 05:19:06 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E5=8E=BB=E6=8E=89=E5=BE=AA=E7=8E=AF?= =?UTF-8?q?=E8=A7=A3=E5=8E=8B=E7=BC=A9=E7=9A=84=E9=80=BB=E8=BE=91=EF=BC=8C?= =?UTF-8?q?=E8=A7=A3=E5=8E=8B=E7=BC=A9=E6=88=90=E5=8A=9F=E5=90=8E=E5=88=A0?= =?UTF-8?q?=E9=99=A4=E5=8E=9F=E6=96=87=E4=BB=B6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .vscode/launch.json | 2 +- corpus_processing/extract.py | 220 +++++++++++++++++++++-------------- tobereomve.txt | 3 + 3 files changed, 134 insertions(+), 91 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 890202a..ae90f35 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -7,7 +7,7 @@ "type": "python", "request": "launch", "program": "${workspaceFolder}/corpus_processing/extract.py", - "args": ["--folder_path","/Volumes/Elements/AITest/test/1256/", "--passwords_files","/Volumes/Elements/AITest/test/README_1.txt"], + "args": ["--folder_path","/Volumes/Elements/AITest/test/11/123", "--passwords_files","/Volumes/Elements/AITest/test/README_1.txt"], // "args": ["--src_dir", "/Volumes/Elements/AITest/testlog.zip", "--dst_dir", "/Volumes/Elements/AITest/AITest", "--n_process", "4", "--threshold", "0.7"], "console": "integratedTerminal" } diff --git a/corpus_processing/extract.py b/corpus_processing/extract.py index 9fa42fa..3b0a146 100644 --- a/corpus_processing/extract.py +++ b/corpus_processing/extract.py @@ -1,6 +1,6 @@ import os, hashlib import argparse -import shutil, io +import shutil import tarfile import zipfile import bz2 @@ -9,7 +9,6 @@ import py7zr import os, sys from charset_mnbvc import api -from better_zipfile import fixcharset_zipfile def get_extension(file_path): filename, extension = os.path.splitext(file_path) @@ -36,7 +35,7 @@ def test_encode(file_path : bytes): pass return None -def check_long_name(extract_full_path, zip_file_name):# longname返回true +def check_long_name(extract_full_path, zip_file_name): paths = zip_file_name.split('/') file_name = paths[-1] if len(file_name.encode()) > 255 and len(os.path.join(extract_full_path, zip_file_name).encode()) < 4095: @@ -45,7 +44,7 @@ def check_long_name(extract_full_path, zip_file_name):# longname返回true length = (255-len(extensions.encode())-8)//2 basename = basename.encode()[:length].decode('utf-8', errors='ignore')+hashlib.md5(file_name.encode()).hexdigest()[:8]+basename.encode()[-length:].decode('utf-8', errors='ignore') new_name = basename + extensions - return os.path.join(extract_full_path, '/'.join(paths[:-1]), new_name), True + return os.path.join(extract_full_path, '/'.join(paths[:-1]), new_name) elif any(len(path.encode()) > 255 for path in paths) or len(os.path.join(extract_full_path, zip_file_name).encode()) > 4095: print(f"File name too long: \n {os.path.join(extract_full_path, zip_file_name)} \n") @@ -53,60 +52,11 @@ def check_long_name(extract_full_path, zip_file_name):# longname返回true new_name = zip_file_name.encode()[:length//2-1].decode('utf-8', errors='ignore') +hashlib.md5(zip_file_name.encode()).hexdigest()[:8]+ zip_file_name.encode()[1-length//2:].decode('utf-8', errors='ignore') new_name = '_'.join(new_name.split('/')) - return os.path.join(extract_full_path, 'long_name', new_name), True - - return os.path.join(extract_full_path, zip_file_name), False - - -def extract_zip(file, password, extract_full_path): - try: - with zipfile.ZipFile(file, 'r') as zip: - zip.setpassword(password) - for file_info in zip.infolist(): - file = file_info.filename - if file.endswith('/') or file.startswith('__MACOSX') : - continue - - try: - file_bytes = file.encode('cp437') - except: - file_bytes = file.encode('utf-8') - - coding_name = test_encode(file_bytes) - - if coding_name is None: - coding_name = api.from_data(file_bytes, mode=2) - - utf8_name = api.convert_encoding( - source_data=file_bytes, - source_encoding=coding_name, - target_encoding="utf-8", - ) - new_string = utf8_name.split('/', 1)[1] - new_file_path ,_ = check_long_name(extract_full_path, new_string) - basename = os.path.dirname(new_file_path) - - os.makedirs(basename, exist_ok=True) - # 复制文件 - source = zip.open(file_info) - with open(new_file_path, "wb") as target: - shutil.copyfileobj(source, target) - print("解压缩完成") - extract_succcessful = True - except zipfile.BadZipFile: - print("错误:无效的ZIP文件。") - extract_succcessful = False - except FileNotFoundError: - print("错误:文件未找到。") - extract_succcessful = False - except IOError as e: - print(f"输入/输出错误:{e}") - extract_succcessful = False - except Exception as e: - print(f"发生了一个未知错误:{e}") - extract_succcessful = False - return extract_succcessful - + return os.path.join(extract_full_path, 'long_name', new_name) + + zipfileName = remove_between_first_second_slash( zip_file_name) + fullPath = os.path.join(extract_full_path, zipfileName) + return fullPath def extract_archive(file_path, extract_full_path, file, password=None): @@ -116,51 +66,33 @@ def extract_archive(file_path, extract_full_path, file, password=None): try: if extension == '.tar': with tarfile.open(file_path, 'r') as tar: - tar.extractall(extract_full_path) + extract_and_convert_zip(tar,extract_full_path) + os.remove(file_path) elif extension == '.tbz2' or extension == '.tar.bz2': with tarfile.open(file_path, 'r:bz2') as tar: tar.extractall(extract_full_path) + os.remove(file_path) elif extension == '.tgz' or extension == '.tar.gz' or extension == '.tar.Z': with tarfile.open(file_path, 'r:gz') as tar: tar.extractall(extract_full_path) + os.remove(file_path) elif extension == '.tar.xz': with tarfile.open(file_path, 'r:xz') as tar: tar.extractall(extract_full_path) + os.remove(file_path) elif extension == '.bz2': if not os.path.exists(extract_full_path): os.mkdir(extract_full_path) with bz2.open(file_path, 'rb') as f_in: with open(os.path.join(extract_full_path, filename), 'wb') as f_out: shutil.copyfileobj(f_in, f_out) + os.remove(file_path) elif extension == '.rar': with rarfile.RarFile(file_path, 'r') as rar: rar.setpassword(password) - - problem = False - - for file in rar.namelist(): - if file.endswith('/'): - continue - new_file_path, if_long_name = check_long_name(extract_full_path, file) - if if_long_name: - problem = True - break - - if problem: - for file in rar.namelist(): - if file.endswith('/'): - continue - new_file_path, _ = check_long_name(extract_full_path, file) - basename = os.path.dirname(new_file_path) - - os.makedirs(basename, exist_ok=True) - with rar.open(file, 'r') as f_in: - data = f_in.read() - with open(new_file_path, 'wb') as f_out: - f_out.write(data) - # print(f"File extract to: {new_file_path}") - else: - rar.extractall(extract_full_path) + extract_and_convert_zip(rar,extract_full_path) + os.remove(file_path) + elif extension == '.gz': if not os.path.exists(extract_full_path): @@ -169,9 +101,55 @@ def extract_archive(file_path, extract_full_path, file, password=None): with gzip.open(file_path, 'rb') as f_in: with open(os.path.join(extract_full_path, filename), 'wb') as f_out: shutil.copyfileobj(f_in, f_out) + os.remove(file_path) elif extension in ('.zip', '.exe'): - extract_succcessful = extract_zip(file_path, password, extract_full_path) - + try: + with zipfile.ZipFile(file_path, 'r') as zip: + zip.setpassword(password) + extract_and_convert_zip(zip,extract_full_path) + # for file_info in zip.infolist(): + # file = file_info.filename + # if file.endswith('/') or file.startswith('__MACOSX/') or is_macos_metadata(file): + # continue + + # try: + # file_bytes = file.encode('cp437') + # except: + # file_bytes = file.encode('utf-8') + + # coding_name = test_encode(file_bytes) + + # if coding_name is None: + # coding_name = api.from_data(file_bytes, mode=2) + + # utf8_name = api.convert_encoding( + # source_data=file_bytes, + # source_encoding=coding_name, + # target_encoding="utf-8", + # ) + + # new_file_path = check_long_name(extract_full_path, utf8_name) + # basename = os.path.dirname(new_file_path) + # os.makedirs(basename, exist_ok=True) + # # 复制文件 + # source = zip.open(file_info) + # with open(new_file_path, "wb") as target: + # shutil.copyfileobj(source, target) + + os.remove(file_path) + print("解压缩完成") + except zipfile.BadZipFile: + print("错误:无效的ZIP文件。") + extract_succcessful = False + except FileNotFoundError: + print("错误:文件未找到。") + extract_succcessful = False + except IOError as e: + print(f"输入/输出错误:{e}") + extract_succcessful = False + except Exception as e: + print(f"发生了一个未知错误:{e}") + extract_succcessful = False elif extension == '.7z': with py7zr.SevenZipFile(file_path, mode='r', password=password) as seven_zip: seven_zip.extractall(extract_full_path) @@ -205,7 +183,8 @@ def traverse_directory(folder_path, passwords=None): for root, dirs, files in os.walk(folder_path): extract_path_set = set(dirs) - + if root != folder_path: + continue for file in files: # 判断文件是否为压缩包类型 if file.endswith(('.tar', '.tbz2', '.tgz', '.tar.bz2', '.tar.gz', '.tar.xz', '.tar.Z', '.bz2', '.rar', '.gz', '.zip', '.xz', '.7z', '.exe')): @@ -234,12 +213,73 @@ def traverse_directory(folder_path, passwords=None): if extract_succcessful: break - if extract_succcessful: - traverse_directory(extract_full_path) + # if extract_succcessful: + # traverse_directory(extract_full_path) extract_path_set.add(extract_path) +def remove_between_first_second_slash(path): + """ + Remove the substring from the first '/' to the second '/' in the path, inclusive. + + :param path: The original path as a string. + :return: The path after removing the substring between the first and second '/'. + """ + first_slash_index = path.find('/') + if first_slash_index == -1: + return path # No '/' found + + + return path[first_slash_index + 1:] + +def is_macos_metadata(path): + """ + Check if the given path is a macOS metadata file. + + :param path: The path to be checked. + :return: True if the path is a macOS metadata file, False otherwise. + """ + return path.split('/')[-1].startswith('._') + + +def extract_and_convert_zip(zip_file, extract_full_path): + """ + Extract files from the zip file, convert filenames to UTF-8, and save them to a target directory. + + :param zip_file: ZipFile object to be extracted. + """ + for file_info in zip_file.infolist(): + file = file_info.filename + + if file.endswith('/') or file.startswith('__MACOSX/') or is_macos_metadata(file): + continue + + try: + file_bytes = file.encode('cp437') + except: + file_bytes = file.encode('utf-8') + + coding_name = test_encode(file_bytes) + + if coding_name is None: + coding_name = api.from_data(file_bytes, mode=2) + + utf8_name = api.convert_encoding( + source_data=file_bytes, + source_encoding=coding_name, + target_encoding="utf-8", + ) + + new_file_path = check_long_name(extract_full_path, utf8_name) + basename = os.path.dirname(new_file_path) + os.makedirs(basename, exist_ok=True) + + # 复制文件 + with zip_file.open(file_info) as source: + with open(new_file_path, "wb") as target: + shutil.copyfileobj(source, target) + if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--folder_path', type=str, required=True, help="压缩包路径") diff --git a/tobereomve.txt b/tobereomve.txt index bbd1986..b77b80c 100644 --- a/tobereomve.txt +++ b/tobereomve.txt @@ -287,3 +287,6 @@ /Volumes/Elements/AITest/test/1256/20221224.zip /Volumes/Elements/AITest/test/1256/20221224.zip /Volumes/Elements/AITest/test/1256/._20221224.zip +/Volumes/Elements/AITest/test/11/123.zip +/Volumes/Elements/AITest/test/11/123/454.zip +/Volumes/Elements/AITest/test/11/123/454的副本.exe