Skip to content

Commit

Permalink
Merge pull request #93 from cosmicexplorer/bulk-parsing
Browse files Browse the repository at this point in the history
perf: parse headers in blocks and scan for magic numbers with memchr
  • Loading branch information
Pr0methean committed May 25, 2024
2 parents 294564c + a28b16e commit b057d0d
Show file tree
Hide file tree
Showing 10 changed files with 1,385 additions and 521 deletions.
3 changes: 2 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ displaydoc = { version = "0.2.4", default-features = false }
flate2 = { version = "1.0.28", default-features = false, optional = true }
indexmap = "2"
hmac = { version = "0.12.1", optional = true, features = ["reset"] }
memchr = "2.7.2"
pbkdf2 = { version = "0.12.2", optional = true }
rand = { version = "0.8.5", optional = true }
sha1 = { version = "0.10.6", optional = true }
Expand All @@ -56,7 +57,7 @@ arbitrary = { version = "1.3.2", features = ["derive"] }

[dev-dependencies]
bencher = "0.1.5"
getrandom = { version = "0.2.14", features = ["js"] }
getrandom = { version = "0.2.14", features = ["js", "std"] }
walkdir = "2.5.0"
time = { workspace = true, features = ["formatting", "macros"] }
anyhow = "1"
Expand Down
106 changes: 97 additions & 9 deletions benches/read_metadata.rs
Original file line number Diff line number Diff line change
@@ -1,38 +1,126 @@
use bencher::{benchmark_group, benchmark_main};

use std::io::{Cursor, Write};
use std::fs;
use std::io::{self, prelude::*, Cursor};

use bencher::Bencher;
use getrandom::getrandom;
use tempdir::TempDir;
use zip::write::SimpleFileOptions;
use zip::{CompressionMethod, ZipArchive, ZipWriter};
use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter};

const FILE_COUNT: usize = 15_000;
const FILE_SIZE: usize = 1024;

fn generate_random_archive(count_files: usize, file_size: usize) -> Vec<u8> {
fn generate_random_archive(count_files: usize, file_size: usize) -> ZipResult<Vec<u8>> {
let data = Vec::new();
let mut writer = ZipWriter::new(Cursor::new(data));
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);

let bytes = vec![0u8; file_size];
let mut bytes = vec![0u8; file_size];

for i in 0..count_files {
let name = format!("file_deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef_{i}.dat");
writer.start_file(name, options).unwrap();
writer.write_all(&bytes).unwrap();
writer.start_file(name, options)?;
getrandom(&mut bytes).map_err(io::Error::from)?;
writer.write_all(&bytes)?;
}

writer.finish().unwrap().into_inner()
Ok(writer.finish()?.into_inner())
}

fn read_metadata(bench: &mut Bencher) {
let bytes = generate_random_archive(FILE_COUNT, FILE_SIZE);
let bytes = generate_random_archive(FILE_COUNT, FILE_SIZE).unwrap();

bench.iter(|| {
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap();
archive.len()
});
bench.bytes = bytes.len() as u64;
}

benchmark_group!(benches, read_metadata);
const COMMENT_SIZE: usize = 50_000;

fn generate_zip32_archive_with_random_comment(comment_length: usize) -> ZipResult<Vec<u8>> {
let data = Vec::new();
let mut writer = ZipWriter::new(Cursor::new(data));
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored);

let mut bytes = vec![0u8; comment_length];
getrandom(&mut bytes).unwrap();
writer.set_raw_comment(bytes.into_boxed_slice());

writer.start_file("asdf.txt", options)?;
writer.write_all(b"asdf")?;

Ok(writer.finish()?.into_inner())
}

/// Benchmarks opening an archive that carries a `COMMENT_SIZE`-byte random comment.
///
/// The archive is generated once, outside the timed closure. Each timed iteration
/// constructs a `ZipArchive` over the in-memory bytes and reads the length of its
/// comment. `bench.bytes` is set to the archive size so throughput can be reported.
fn parse_archive_with_comment(bench: &mut Bencher) {
    let bytes = generate_zip32_archive_with_random_comment(COMMENT_SIZE).unwrap();

    // Use `iter` so the closure is actually timed; returning the length hands the
    // result back to the harness instead of discarding it.
    bench.iter(|| {
        let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap();
        archive.comment().len()
    });
    bench.bytes = bytes.len() as u64;
}

const COMMENT_SIZE_64: usize = 500_000;

fn generate_zip64_archive_with_random_comment(comment_length: usize) -> ZipResult<Vec<u8>> {
let data = Vec::new();
let mut writer = ZipWriter::new(Cursor::new(data));
let options = SimpleFileOptions::default()
.compression_method(CompressionMethod::Stored)
.large_file(true);

let mut bytes = vec![0u8; comment_length];
getrandom(&mut bytes).unwrap();
writer.set_raw_comment(bytes.into_boxed_slice());

writer.start_file("asdf.txt", options)?;
writer.write_all(b"asdf")?;

Ok(writer.finish()?.into_inner())
}

/// Benchmarks opening the `large_file` archive that carries a
/// `COMMENT_SIZE_64`-byte random comment.
///
/// The archive is generated once up front; every timed iteration parses it from
/// memory and yields the comment length to the harness.
fn parse_zip64_archive_with_comment(bench: &mut Bencher) {
    let archive_bytes = generate_zip64_archive_with_random_comment(COMMENT_SIZE_64).unwrap();

    bench.iter(|| {
        let reader = Cursor::new(archive_bytes.as_slice());
        let parsed = ZipArchive::new(reader).unwrap();
        parsed.comment().len()
    });

    // Report throughput relative to the full archive size.
    let total_len = archive_bytes.len() as u64;
    bench.bytes = total_len;
}

/// Benchmarks reading every entry of a small archive through the streaming reader.
///
/// A five-entry archive of five-byte files is generated in memory and written to a
/// file inside a temporary directory. Each timed iteration reopens that file and
/// calls `read_zipfile_from_stream` until it reports no further entries.
fn parse_stream_archive(bench: &mut Bencher) {
    const STREAM_ZIP_ENTRIES: usize = 5;
    const STREAM_FILE_SIZE: usize = 5;

    let archive_bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap();

    /* Write to a temporary file path to incur some filesystem overhead from repeated reads */
    let scratch_dir = TempDir::new("stream-bench").unwrap();
    let archive_path = scratch_dir.path().join("bench-out.zip");
    fs::write(&archive_path, &archive_bytes).unwrap();

    bench.iter(|| {
        let mut file = fs::File::open(&archive_path).unwrap();
        loop {
            // Each returned entry is dropped before the next one is requested.
            let entry = zip::read::read_zipfile_from_stream(&mut file).unwrap();
            if entry.is_none() {
                break;
            }
        }
    });
    bench.bytes = archive_bytes.len() as u64;
}

benchmark_group!(
benches,
read_metadata,
parse_archive_with_comment,
parse_zip64_archive_with_comment,
parse_stream_archive,
);
benchmark_main!(benches);
47 changes: 24 additions & 23 deletions src/compression.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,7 @@ impl CompressionMethod {
pub const AES: Self = CompressionMethod::Unsupported(99);
}
impl CompressionMethod {
/// Converts an u16 to its corresponding CompressionMethod
#[deprecated(
since = "0.5.7",
note = "use a constant to construct a compression method"
)]
pub const fn from_u16(val: u16) -> CompressionMethod {
#[allow(deprecated)]
pub(crate) const fn parse_from_u16(val: u16) -> Self {
match val {
0 => CompressionMethod::Stored,
#[cfg(feature = "_deflate-any")]
Expand All @@ -111,18 +105,21 @@ impl CompressionMethod {
93 => CompressionMethod::Zstd,
#[cfg(feature = "aes-crypto")]
99 => CompressionMethod::Aes,

#[allow(deprecated)]
v => CompressionMethod::Unsupported(v),
}
}

/// Converts a CompressionMethod to a u16
/// Converts a u16 to its corresponding CompressionMethod
#[deprecated(
since = "0.5.7",
note = "to match on other compression methods, use a constant"
note = "use a constant to construct a compression method"
)]
pub const fn to_u16(self) -> u16 {
#[allow(deprecated)]
pub const fn from_u16(val: u16) -> CompressionMethod {
Self::parse_from_u16(val)
}

pub(crate) const fn serialize_to_u16(self) -> u16 {
match self {
CompressionMethod::Stored => 0,
#[cfg(feature = "_deflate-any")]
Expand All @@ -137,10 +134,19 @@ impl CompressionMethod {
CompressionMethod::Zstd => 93,
#[cfg(feature = "lzma")]
CompressionMethod::Lzma => 14,

#[allow(deprecated)]
CompressionMethod::Unsupported(v) => v,
}
}

/// Converts a CompressionMethod to a u16
#[deprecated(
since = "0.5.7",
note = "to match on other compression methods, use a constant"
)]
pub const fn to_u16(self) -> u16 {
self.serialize_to_u16()
}
}

impl Default for CompressionMethod {
Expand Down Expand Up @@ -180,23 +186,18 @@ mod test {
#[test]
fn from_eq_to() {
for v in 0..(u16::MAX as u32 + 1) {
#[allow(deprecated)]
let from = CompressionMethod::from_u16(v as u16);
#[allow(deprecated)]
let to = from.to_u16() as u32;
let from = CompressionMethod::parse_from_u16(v as u16);
let to = from.serialize_to_u16() as u32;
assert_eq!(v, to);
}
}

#[test]
fn to_eq_from() {
fn check_match(method: CompressionMethod) {
#[allow(deprecated)]
let to = method.to_u16();
#[allow(deprecated)]
let from = CompressionMethod::from_u16(to);
#[allow(deprecated)]
let back = from.to_u16();
let to = method.serialize_to_u16();
let from = CompressionMethod::parse_from_u16(to);
let back = from.serialize_to_u16();
assert_eq!(to, back);
}

Expand Down
Loading

0 comments on commit b057d0d

Please sign in to comment.