diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..065dcdf --- /dev/null +++ b/.clang-format @@ -0,0 +1,225 @@ +--- +Language: Cpp +# BasedOnStyle: Mozilla +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: true +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + PadOperators: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Always + OverEmptyLines: 0 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: Never +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: TopLevel +AlwaysBreakAfterReturnType: TopLevel +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: Yes +AttributeMacros: + - __capability +BinPackArguments: false +BinPackParameters: false +BitFieldColonSpacing: Both +BraceWrapping: + AfterCaseLabel: false + AfterClass: true + AfterControlStatement: Never + AfterEnum: true + AfterExternBlock: true + AfterFunction: true + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: true + AfterUnion: true + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: false + 
SplitEmptyNamespace: true +BreakAfterAttributes: Never +BreakAfterJavaFieldAnnotations: false +BreakArrays: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: Always +BreakBeforeBraces: Mozilla +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakInheritanceList: BeforeComma +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerIndentWidth: 2 +ContinuationIndentWidth: 2 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: '^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseBlocks: false +IndentCaseLabels: true +IndentExternBlock: AfterExternBlock +IndentGotoLabels: true +IndentPPDirectives: None +IndentRequiresClause: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +InsertBraces: false +InsertNewlineAtEOF: false +InsertTrailingCommas: None +IntegerLiteralSeparator: + Binary: 0 + BinaryMinDigits: 0 + Decimal: 0 + DecimalMinDigits: 0 + Hex: 0 + HexMinDigits: 0 +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +LineEnding: DeriveLF +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 
+ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: false +PackConstructorInitializers: BinPack +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyIndentedWhitespace: 0 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +PPIndentWidth: -1 +QualifierAlignment: Leave +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +RemoveSemicolon: false +RequiresClausePosition: OwnLine +RequiresExpressionIndentation: OuterScope +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: LexicographicNumeric +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +SpaceAroundPointerQualifiers: Default +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + AfterRequiresInClause: false + AfterRequiresInExpression: false + BeforeNonEmptyParentheses: false +SpaceBeforeRangeBasedForLoopColon: true +SpaceBeforeSquareBrackets: false +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Latest +StatementAttributeLikeMacros: + - 
Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseTab: Never +WhitespaceSensitiveMacros: + - BOOST_PP_STRINGIZE + - CF_SWIFT_NAME + - NS_SWIFT_NAME + - PP_STRINGIZE + - STRINGIZE +... + diff --git a/.github/workflows/test_ci.yml b/.github/workflows/test_ci.yml index 0d5f544..8f32bcd 100644 --- a/.github/workflows/test_ci.yml +++ b/.github/workflows/test_ci.yml @@ -12,15 +12,11 @@ jobs: runs-on: ${{matrix.os}} strategy: matrix: - os: [ubuntu-latest, macos-latest] + # From https://github.com/actions/runner-images#available-images + os: [ubuntu-latest, macos-latest, flyci-macos-large-latest-m1, flyci-macos-large-latest-m2] steps: - uses: actions/checkout@v4 - # From https://github.com/marketplace/actions/actions-setup-cmake - - name: Setup CMake - uses: jwlawson/actions-setup-cmake@v1.13 - with: - cmake-version: 'latest' - name: Setup Google-Test run: | pushd ~ diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..4c07216 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,3 @@ +[submodule "gtest-parallel"] + path = gtest-parallel + url = https://github.com/google/gtest-parallel diff --git a/Makefile b/Makefile index 38cfeac..af85178 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,7 @@ BENCHMARK_LINK_FLAGS = -lbenchmark -lbenchmark_main BENCHMARK_BINARY = $(BENCHMARK_BUILD_DIR)/bench.out PERF_LINK_FLAGS = -lbenchmark -lbenchmark_main -lpfm PERF_BINARY = $(PERF_BUILD_DIR)/perf.out +GTEST_PARALLEL = ./gtest-parallel/gtest-parallel all: test @@ -53,6 +54,9 @@ $(BENCHMARK_BUILD_DIR): $(PERF_BUILD_DIR): mkdir -p $@ +$(GTEST_PARALLEL): + git submodule update --init + $(TEST_BUILD_DIR)/%.o: $(TEST_DIR)/%.cpp $(TEST_BUILD_DIR) $(CXX) $(CXX_FLAGS) $(WARN_FLAGS) $(OPT_FLAGS) $(I_FLAGS) -c $< -o $@ @@ -71,14 +75,14 @@ $(ASAN_TEST_BINARY): $(ASAN_TEST_OBJECTS) $(UBSAN_TEST_BINARY): $(UBSAN_TEST_OBJECTS) $(CXX) $(UBSAN_FLAGS) $^ $(TEST_LINK_FLAGS) -o $@ -test: $(TEST_BINARY) - ./$< --gtest_shuffle --gtest_random_seed=0 +test: 
$(TEST_BINARY) $(GTEST_PARALLEL) + $(GTEST_PARALLEL) $< --print_test_times -asan_test: $(ASAN_TEST_BINARY) - ./$< --gtest_shuffle --gtest_random_seed=0 +asan_test: $(ASAN_TEST_BINARY) $(GTEST_PARALLEL) + $(GTEST_PARALLEL) $< --print_test_times -ubsan_test: $(UBSAN_TEST_BINARY) - ./$< --gtest_shuffle --gtest_random_seed=0 +ubsan_test: $(UBSAN_TEST_BINARY) $(GTEST_PARALLEL) + $(GTEST_PARALLEL) $< --print_test_times $(BENCHMARK_BUILD_DIR)/%.o: $(BENCHMARK_DIR)/%.cpp $(BENCHMARK_BUILD_DIR) $(CXX) $(CXX_FLAGS) $(WARN_FLAGS) $(OPT_FLAGS) $(I_FLAGS) -c $< -o $@ @@ -106,4 +110,4 @@ clean: rm -rf $(BUILD_DIR) format: $(SHA3_SOURCES) $(TEST_SOURCES) $(BENCHMARK_SOURCES) - clang-format -i --style=Mozilla $^ + clang-format -i $^ diff --git a/README.md b/README.md index c87ccb6..f5b41b3 100644 --- a/README.md +++ b/README.md @@ -62,15 +62,16 @@ cmake version 3.25.1 - For testing SHA3 algorithms, you need to globally install `google-test` library and headers. Follow [this](https://github.com/google/googletest/tree/main/googletest#standalone-cmake-project) guide if you haven't installed it yet. - For benchmarking SHA3 algorithms, targeting CPU systems, `google-benchmark` library and headers are required to be installed system-wide. Follow [this](https://github.com/google/benchmark#installation) guide if you don't have it installed yet. -- If you are on a machine running GNU/Linux kernel and you want to obtain following (see list below), when benchmarking SHA3 algorithms, you should consider building `google-benchmark` library yourself with libPFM support, following [this](https://gist.github.com/itzmeanjan/05dc3e946f635d00c5e0b21aae6203a7) step-by-step guide. Find more about libPFM @ https://perfmon2.sourceforge.net. - 1) CPU cycle count. - 2) Retired instruction count. - 3) Cycles/ byte ( aka cpb ). - 4) Retired instructions/ cycle ( aka ipc ). 
+ +> [!NOTE] +> If you are on a machine running the GNU/Linux kernel and you want to obtain CPU cycle counts, cycles/ byte, instructions/ cycle etc. when benchmarking SHA3 algorithms, you should consider building the `google-benchmark` library yourself with `libPFM` support, following the step-by-step guide @ https://gist.github.com/itzmeanjan/05dc3e946f635d00c5e0b21aae6203a7. Find more about libPFM @ https://perfmon2.sourceforge.net. + +> [!TIP] +> Git submodule-based dependencies will generally be imported automatically, but in case that doesn't work, you can manually bring them in by issuing `$ git submodule update --init` from inside the root of this repository. ## Testing -For ensuring that SHA3 hash function and extendable output function implementations are correct & conformant to the NIST standard ( see [this](https://dx.doi.org/10.6028/NIST.FIPS.202) ), I make use of K(nown) A(nswer) T(ests), generated following [this](https://gist.github.com/itzmeanjan/448f97f9c49d781a5eb3ddd6ea6e7364) gist. +For ensuring that SHA3 hash function and extendable output function implementations are correct & conformant to the NIST standard ( see https://dx.doi.org/10.6028/NIST.FIPS.202 ), I make use of K(nown) A(nswer) T(ests), generated following the gist @ https://gist.github.com/itzmeanjan/448f97f9c49d781a5eb3ddd6ea6e7364. I also test correctness of @@ -82,58 +83,32 @@ Some compile-time executed tests ( using `static_assert` ) are also implemented, Issue following command for running all the test cases. ```bash -make -j +make -j # Run tests without any sort of sanitizers +make asan_test -j # Run tests with AddressSanitizer enabled +make ubsan_test -j # Run tests with UndefinedBehaviorSanitizer enabled ``` ```bash -Note: Randomizing tests' orders with a seed of 54033 . -[==========] Running 18 tests from 2 test suites. -[----------] Global test environment set-up. 
-[----------] 6 tests from Sha3Xof -[ RUN ] Sha3Xof.Shake256KnownAnswerTests -[ OK ] Sha3Xof.Shake256KnownAnswerTests (2 ms) -[ RUN ] Sha3Xof.CompileTimeEvalShake256 -[ OK ] Sha3Xof.CompileTimeEvalShake256 (0 ms) -[ RUN ] Sha3Xof.Shake128KnownAnswerTests -[ OK ] Sha3Xof.Shake128KnownAnswerTests (2 ms) -[ RUN ] Sha3Xof.CompileTimeEvalShake128 -[ OK ] Sha3Xof.CompileTimeEvalShake128 (0 ms) -[ RUN ] Sha3Xof.Shake128IncrementalAbsorptionAndSqueezing -[ OK ] Sha3Xof.Shake128IncrementalAbsorptionAndSqueezing (915 ms) -[ RUN ] Sha3Xof.Shake256IncrementalAbsorptionAndSqueezing -[ OK ] Sha3Xof.Shake256IncrementalAbsorptionAndSqueezing (994 ms) -[----------] 6 tests from Sha3Xof (1916 ms total) - -[----------] 12 tests from Sha3Hashing -[ RUN ] Sha3Hashing.Sha3_384KnownAnswerTests -[ OK ] Sha3Hashing.Sha3_384KnownAnswerTests (2 ms) -[ RUN ] Sha3Hashing.Sha3_224KnownAnswerTests -[ OK ] Sha3Hashing.Sha3_224KnownAnswerTests (1 ms) -[ RUN ] Sha3Hashing.Sha3_256KnownAnswerTests -[ OK ] Sha3Hashing.Sha3_256KnownAnswerTests (2 ms) -[ RUN ] Sha3Hashing.CompileTimeEvalSha3_384 -[ OK ] Sha3Hashing.CompileTimeEvalSha3_384 (0 ms) -[ RUN ] Sha3Hashing.CompileTimeEvalSha3_224 -[ OK ] Sha3Hashing.CompileTimeEvalSha3_224 (0 ms) -[ RUN ] Sha3Hashing.Sha3_224IncrementalAbsorption -[ OK ] Sha3Hashing.Sha3_224IncrementalAbsorption (1 ms) -[ RUN ] Sha3Hashing.Sha3_384IncrementalAbsorption -[ OK ] Sha3Hashing.Sha3_384IncrementalAbsorption (1 ms) -[ RUN ] Sha3Hashing.Sha3_512IncrementalAbsorption -[ OK ] Sha3Hashing.Sha3_512IncrementalAbsorption (2 ms) -[ RUN ] Sha3Hashing.Sha3_256IncrementalAbsorption -[ OK ] Sha3Hashing.Sha3_256IncrementalAbsorption (1 ms) -[ RUN ] Sha3Hashing.CompileTimeEvalSha3_512 -[ OK ] Sha3Hashing.CompileTimeEvalSha3_512 (0 ms) -[ RUN ] Sha3Hashing.CompileTimeEvalSha3_256 -[ OK ] Sha3Hashing.CompileTimeEvalSha3_256 (0 ms) -[ RUN ] Sha3Hashing.Sha3_512KnownAnswerTests -[ OK ] Sha3Hashing.Sha3_512KnownAnswerTests (2 ms) -[----------] 12 tests from Sha3Hashing (16 ms total) - 
-[----------] Global test environment tear-down -[==========] 18 tests from 2 test suites ran. (1932 ms total) -[ PASSED ] 18 tests. +[18/18] Sha3Xof.Shake256IncrementalAbsorptionAndSqueezing (1149 ms) +PASSED TESTS (18/18): + 1 ms: build/tests/test.out Sha3Hashing.CompileTimeEvalSha3_512 + 1 ms: build/tests/test.out Sha3Xof.CompileTimeEvalShake128 + 1 ms: build/tests/test.out Sha3Hashing.CompileTimeEvalSha3_384 + 1 ms: build/tests/test.out Sha3Xof.CompileTimeEvalShake256 + 2 ms: build/tests/test.out Sha3Hashing.CompileTimeEvalSha3_224 + 2 ms: build/tests/test.out Sha3Hashing.CompileTimeEvalSha3_256 + 5 ms: build/tests/test.out Sha3Hashing.Sha3_224KnownAnswerTests + 5 ms: build/tests/test.out Sha3Hashing.Sha3_256KnownAnswerTests + 6 ms: build/tests/test.out Sha3Hashing.Sha3_512KnownAnswerTests + 6 ms: build/tests/test.out Sha3Hashing.Sha3_512IncrementalAbsorption + 6 ms: build/tests/test.out Sha3Hashing.Sha3_384IncrementalAbsorption + 6 ms: build/tests/test.out Sha3Hashing.Sha3_256IncrementalAbsorption + 6 ms: build/tests/test.out Sha3Hashing.Sha3_224IncrementalAbsorption + 7 ms: build/tests/test.out Sha3Xof.Shake128KnownAnswerTests + 7 ms: build/tests/test.out Sha3Hashing.Sha3_384KnownAnswerTests + 7 ms: build/tests/test.out Sha3Xof.Shake256KnownAnswerTests + 1054 ms: build/tests/test.out Sha3Xof.Shake128IncrementalAbsorptionAndSqueezing + 1149 ms: build/tests/test.out Sha3Xof.Shake256IncrementalAbsorptionAndSqueezing ``` ## Benchmarking @@ -154,409 +129,611 @@ make perf -j # You must issue this if you built your google-benchmark libra make benchmark -j # Or you can simply use this. 
``` -### On 12th Gen Intel(R) Core(TM) i7-1260P ( compiled with Clang-17.0.2 ) +### On 12th Gen Intel(R) Core(TM) i7-1260P ( compiled with `Ubuntu clang version 17.0.2 (1~exp1ubuntu2.1)` ) ```bash -2023-12-22T20:37:13+04:00 +2024-01-20T16:32:22+04:00 Running ./build/perfs/perf.out -Run on (16 X 2858.27 MHz CPU s) +Run on (16 X 4199.2 MHz CPU s) CPU Caches: L1 Data 48 KiB (x8) L1 Instruction 32 KiB (x8) L2 Unified 1280 KiB (x8) L3 Unified 18432 KiB (x1) -Load Average: 0.19, 0.42, 0.52 +Load Average: 1.03, 0.94, 0.72 ------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations CYCLES CYCLES/ BYTE bytes_per_second ------------------------------------------------------------------------------------------------------------- -keccak-p[1600, 24]_mean 181 ns 181 ns 10 837.237 4.18618 1.02809Gi/s -keccak-p[1600, 24]_median 181 ns 181 ns 10 836.973 4.18486 1.03168Gi/s -keccak-p[1600, 24]_stddev 2.23 ns 2.25 ns 10 1.8613 9.30649m 13.014Mi/s -keccak-p[1600, 24]_cv 1.23 % 1.24 % 10 0.22% 0.22% 1.24% -keccak-p[1600, 24]_min 179 ns 179 ns 10 834.762 4.17381 1.00823Gi/s -keccak-p[1600, 24]_max 185 ns 185 ns 10 840.275 4.20138 1.04163Gi/s -shake256/256/64_mean 405 ns 405 ns 10 1.83538k 5.73557 755.048Mi/s -shake256/256/64_median 399 ns 399 ns 10 1.86572k 5.83036 764.974Mi/s -shake256/256/64_stddev 12.4 ns 12.4 ns 10 99.3426 0.310446 21.748Mi/s -shake256/256/64_cv 3.07 % 3.06 % 10 5.41% 5.41% 2.88% -shake256/256/64_min 398 ns 398 ns 10 1.55283k 4.8526 697.32Mi/s -shake256/256/64_max 438 ns 438 ns 10 1.87209k 5.85027 767.629Mi/s -sha3_256/1024_mean 1537 ns 1537 ns 10 7.07262k 6.69756 655.479Mi/s -sha3_256/1024_median 1529 ns 1529 ns 10 7.07352k 6.69841 658.739Mi/s -sha3_256/1024_stddev 24.4 ns 24.4 ns 10 7.30414 6.9168m 10.3573Mi/s -sha3_256/1024_cv 1.59 % 1.59 % 10 0.10% 0.10% 1.58% -sha3_256/1024_min 1511 ns 1511 ns 10 7.05944k 6.68508 640.517Mi/s -sha3_256/1024_max 1572 ns 1572 ns 10 7.08317k 6.70755 
666.49Mi/s -shake128/256/64_mean 404 ns 404 ns 10 1.87853k 5.87041 755.517Mi/s -shake128/256/64_median 403 ns 403 ns 10 1.87905k 5.87203 756.392Mi/s -shake128/256/64_stddev 4.55 ns 4.55 ns 10 6.58881 0.02059 8.40068Mi/s -shake128/256/64_cv 1.13 % 1.13 % 10 0.35% 0.35% 1.11% -shake128/256/64_min 399 ns 399 ns 10 1.86691k 5.8341 736.181Mi/s -shake128/256/64_max 415 ns 415 ns 10 1.88664k 5.89574 765.307Mi/s -sha3_224/256_mean 404 ns 404 ns 10 1.87892k 6.61593 670.059Mi/s -sha3_224/256_median 404 ns 404 ns 10 1.87753k 6.61101 671.18Mi/s -sha3_224/256_stddev 3.60 ns 3.60 ns 10 5.79879 0.0204183 5.91264Mi/s -sha3_224/256_cv 0.89 % 0.89 % 10 0.31% 0.31% 0.88% -sha3_224/256_min 400 ns 400 ns 10 1.87165k 6.59031 657.545Mi/s -sha3_224/256_max 412 ns 412 ns 10 1.89078k 6.65769 676.45Mi/s -sha3_224/16384_mean 21890 ns 21889 ns 10 98.2326k 5.98541 715.351Mi/s -sha3_224/16384_median 21813 ns 21812 ns 10 99.5154k 6.06357 717.561Mi/s -sha3_224/16384_stddev 480 ns 479 ns 10 4.49737k 0.274029 15.2321Mi/s -sha3_224/16384_cv 2.19 % 2.19 % 10 4.58% 4.58% 2.13% -sha3_224/16384_min 21232 ns 21231 ns 10 85.4646k 5.20744 679.143Mi/s -sha3_224/16384_max 23051 ns 23046 ns 10 100.452k 6.12062 737.194Mi/s -sha3_512/16384_mean 43882 ns 43881 ns 10 202.507k 12.3119 357.523Mi/s -sha3_512/16384_median 43953 ns 43950 ns 10 202.444k 12.3081 356.911Mi/s -sha3_512/16384_stddev 581 ns 581 ns 10 341.924 0.0207882 4.7201Mi/s -sha3_512/16384_cv 1.32 % 1.32 % 10 0.17% 0.17% 1.32% -sha3_512/16384_min 43228 ns 43227 ns 10 201.982k 12.2801 350.364Mi/s -sha3_512/16384_max 44771 ns 44771 ns 10 203.122k 12.3493 362.873Mi/s -sha3_384/16384_mean 29721 ns 29720 ns 10 136.298k 8.29467 527.464Mi/s -sha3_384/16384_median 29896 ns 29895 ns 10 136.401k 8.30095 524.211Mi/s -sha3_384/16384_stddev 588 ns 588 ns 10 455.296 0.0277079 10.4574Mi/s -sha3_384/16384_cv 1.98 % 1.98 % 10 0.33% 0.33% 1.98% -sha3_384/16384_min 28915 ns 28915 ns 10 135.525k 8.24765 513.009Mi/s -sha3_384/16384_max 30547 ns 30547 ns 10 136.905k 8.33161 
541.967Mi/s -shake128/4096/64_mean 4720 ns 4720 ns 10 21.7808k 5.23577 840.751Mi/s -shake128/4096/64_median 4693 ns 4693 ns 10 21.7731k 5.23392 845.389Mi/s -shake128/4096/64_stddev 83.6 ns 83.6 ns 10 84.5016 0.0203129 14.5613Mi/s -shake128/4096/64_cv 1.77 % 1.77 % 10 0.39% 0.39% 1.73% -shake128/4096/64_min 4639 ns 4639 ns 10 21.6862k 5.21303 806.832Mi/s -shake128/4096/64_max 4917 ns 4917 ns 10 21.8929k 5.26272 855.194Mi/s -sha3_384/4096_mean 7474 ns 7474 ns 10 34.608k 8.35135 528.941Mi/s -sha3_384/4096_median 7413 ns 7413 ns 10 34.606k 8.35088 533.147Mi/s -sha3_384/4096_stddev 146 ns 146 ns 10 77.1853 0.0186258 10.0874Mi/s -sha3_384/4096_cv 1.96 % 1.96 % 10 0.22% 0.22% 1.91% -sha3_384/4096_min 7371 ns 7371 ns 10 34.4843k 8.32149 507.015Mi/s -sha3_384/4096_max 7795 ns 7795 ns 10 34.771k 8.39068 536.187Mi/s -sha3_224/4096_mean 5532 ns 5532 ns 10 25.4595k 6.17349 711.141Mi/s -sha3_224/4096_median 5545 ns 5545 ns 10 25.4525k 6.17181 709.282Mi/s -sha3_224/4096_stddev 94.2 ns 94.2 ns 10 67.6755 0.0164102 11.9676Mi/s -sha3_224/4096_cv 1.70 % 1.70 % 10 0.27% 0.27% 1.68% -sha3_224/4096_min 5432 ns 5432 ns 10 25.3394k 6.14437 686.076Mi/s -sha3_224/4096_max 5733 ns 5733 ns 10 25.6042k 6.20858 724.059Mi/s -shake256/1024/64_mean 1535 ns 1534 ns 10 7.05342k 6.48292 676.32Mi/s -shake256/1024/64_median 1540 ns 1540 ns 10 7.06512k 6.49368 673.576Mi/s -shake256/1024/64_stddev 22.5 ns 22.7 ns 10 42.2604 0.0388423 10.0294Mi/s -shake256/1024/64_cv 1.47 % 1.48 % 10 0.60% 0.60% 1.48% -shake256/1024/64_min 1508 ns 1508 ns 10 6.93534k 6.37439 662.005Mi/s -shake256/1024/64_max 1567 ns 1567 ns 10 7.07912k 6.50655 688.122Mi/s -sha3_512/64_mean 215 ns 215 ns 10 1.00542k 7.85481 566.518Mi/s -sha3_512/64_median 215 ns 215 ns 10 1.00529k 7.85385 568.437Mi/s -sha3_512/64_stddev 1.52 ns 1.52 ns 10 2.25005 0.0175785 3.96599Mi/s -sha3_512/64_cv 0.71 % 0.70 % 10 0.22% 0.22% 0.70% -sha3_512/64_min 214 ns 214 ns 10 1.00278k 7.83418 559.677Mi/s -sha3_512/64_max 218 ns 218 ns 10 1.0107k 7.89611 
570.428Mi/s -shake128/64/64_mean 206 ns 206 ns 10 951.967 7.43724 591.569Mi/s -shake128/64/64_median 206 ns 206 ns 10 951.08 7.43031 593.714Mi/s -shake128/64/64_stddev 3.13 ns 3.13 ns 10 4.31938 0.0337451 8.87081Mi/s -shake128/64/64_cv 1.52 % 1.52 % 10 0.45% 0.45% 1.50% -shake128/64/64_min 202 ns 202 ns 10 946.77 7.39664 572.961Mi/s -shake128/64/64_max 213 ns 213 ns 10 961.409 7.51101 602.924Mi/s -sha3_256/16384_mean 22972 ns 22971 ns 10 104.863k 6.38786 681.748Mi/s -sha3_256/16384_median 23020 ns 23020 ns 10 104.825k 6.38551 680.089Mi/s -sha3_256/16384_stddev 426 ns 426 ns 10 259.584 0.0158129 12.6967Mi/s -sha3_256/16384_cv 1.85 % 1.86 % 10 0.25% 0.25% 1.86% -sha3_256/16384_min 22360 ns 22359 ns 10 104.49k 6.36511 664.996Mi/s -sha3_256/16384_max 23542 ns 23542 ns 10 105.24k 6.41079 700.201Mi/s -sha3_512/1024_mean 2903 ns 2903 ns 10 13.506k 12.4136 357.412Mi/s -sha3_512/1024_median 2897 ns 2897 ns 10 13.5026k 12.4105 358.117Mi/s -sha3_512/1024_stddev 28.1 ns 28.1 ns 10 15.7898 0.0145126 3.40526Mi/s -sha3_512/1024_cv 0.97 % 0.97 % 10 0.12% 0.12% 0.95% -sha3_512/1024_min 2879 ns 2879 ns 10 13.4818k 12.3913 348.632Mi/s -sha3_512/1024_max 2976 ns 2976 ns 10 13.5312k 12.4367 360.364Mi/s -sha3_384/1024_mean 1909 ns 1909 ns 10 8.76234k 8.17383 535.654Mi/s -sha3_384/1024_median 1901 ns 1901 ns 10 8.76342k 8.17483 537.835Mi/s -sha3_384/1024_stddev 37.4 ns 37.4 ns 10 17.098 0.0159496 10.4208Mi/s -sha3_384/1024_cv 1.96 % 1.96 % 10 0.20% 0.20% 1.95% -sha3_384/1024_min 1867 ns 1867 ns 10 8.73761k 8.15075 519.858Mi/s -sha3_384/1024_max 1967 ns 1967 ns 10 8.79026k 8.19987 547.652Mi/s -sha3_512/256_mean 795 ns 795 ns 10 3.69684k 11.5526 384.1Mi/s -sha3_512/256_median 791 ns 790 ns 10 3.69629k 11.5509 386.158Mi/s -sha3_512/256_stddev 8.66 ns 8.71 ns 10 4.86183 0.0151932 4.15888Mi/s -sha3_512/256_cv 1.09 % 1.10 % 10 0.13% 0.13% 1.08% -sha3_512/256_min 788 ns 788 ns 10 3.68917k 11.5287 376.657Mi/s -sha3_512/256_max 810 ns 810 ns 10 3.70461k 11.5769 387.253Mi/s -sha3_512/4096_mean 
11001 ns 11000 ns 10 50.7833k 12.2075 360.737Mi/s -sha3_512/4096_median 10991 ns 10991 ns 10 50.7718k 12.2048 360.975Mi/s -sha3_512/4096_stddev 148 ns 150 ns 10 86.6208 0.0208223 4.8742Mi/s -sha3_512/4096_cv 1.35 % 1.36 % 10 0.17% 0.17% 1.35% -sha3_512/4096_min 10845 ns 10845 ns 10 50.66k 12.1779 351.713Mi/s -sha3_512/4096_max 11282 ns 11280 ns 10 50.941k 12.2454 365.819Mi/s -sha3_224/64_mean 206 ns 206 ns 10 962.044 10.457 425.523Mi/s -sha3_224/64_median 206 ns 206 ns 10 961.039 10.4461 426.623Mi/s -sha3_224/64_stddev 2.55 ns 2.56 ns 10 4.39172 0.047736 5.14888Mi/s -sha3_224/64_cv 1.24 % 1.24 % 10 0.46% 0.46% 1.21% -sha3_224/64_min 204 ns 204 ns 10 957.616 10.4089 411.863Mi/s -sha3_224/64_max 213 ns 213 ns 10 971.505 10.5598 429.501Mi/s -sha3_256/64_mean 204 ns 204 ns 10 945.093 9.84472 448.01Mi/s -sha3_256/64_median 203 ns 203 ns 10 946.431 9.85865 450.786Mi/s -sha3_256/64_stddev 3.35 ns 3.35 ns 10 4.88271 0.0508616 7.25403Mi/s -sha3_256/64_cv 1.64 % 1.64 % 10 0.52% 0.52% 1.62% -sha3_256/64_min 200 ns 200 ns 10 938.206 9.77298 434.522Mi/s -sha3_256/64_max 211 ns 211 ns 10 952.231 9.91908 456.64Mi/s -shake128/16384/64_mean 18721 ns 18718 ns 10 85.4555k 5.19549 838.278Mi/s -shake128/16384/64_median 18721 ns 18721 ns 10 85.5107k 5.19885 837.893Mi/s -shake128/16384/64_stddev 337 ns 341 ns 10 473.477 0.0287863 15.2678Mi/s -shake128/16384/64_cv 1.80 % 1.82 % 10 0.55% 0.55% 1.82% -shake128/16384/64_min 18248 ns 18223 ns 10 84.8427k 5.15824 814.36Mi/s -shake128/16384/64_max 19262 ns 19262 ns 10 86.2438k 5.24342 860.788Mi/s -sha3_224/1024_mean 1559 ns 1559 ns 10 7.13697k 6.78419 643.852Mi/s -sha3_224/1024_median 1557 ns 1557 ns 10 7.13532k 6.78262 644.457Mi/s -sha3_224/1024_stddev 32.5 ns 32.5 ns 10 19.8615 0.0188797 13.3742Mi/s -sha3_224/1024_cv 2.08 % 2.08 % 10 0.28% 0.28% 2.08% -sha3_224/1024_min 1519 ns 1519 ns 10 7.10831k 6.75694 624.346Mi/s -sha3_224/1024_max 1607 ns 1607 ns 10 7.16852k 6.81418 660.307Mi/s -sha3_384/256_mean 585 ns 585 ns 10 2.69999k 8.88155 
495.407Mi/s -sha3_384/256_median 580 ns 580 ns 10 2.69742k 8.87311 500.05Mi/s -sha3_384/256_stddev 11.1 ns 11.0 ns 10 10.1907 0.0335222 9.2635Mi/s -sha3_384/256_cv 1.89 % 1.89 % 10 0.38% 0.38% 1.87% -sha3_384/256_min 575 ns 574 ns 10 2.6921k 8.8556 481.779Mi/s -sha3_384/256_max 602 ns 602 ns 10 2.72732k 8.97144 504.645Mi/s -shake256/64/64_mean 206 ns 206 ns 10 950.677 7.42717 592.476Mi/s -shake256/64/64_median 205 ns 205 ns 10 949.802 7.42033 594.077Mi/s -shake256/64/64_stddev 3.04 ns 3.04 ns 10 3.81902 0.0298361 8.68042Mi/s -shake256/64/64_cv 1.47 % 1.47 % 10 0.40% 0.40% 1.47% -shake256/64/64_min 202 ns 202 ns 10 944.628 7.37991 577.594Mi/s -shake256/64/64_max 211 ns 211 ns 10 957.561 7.48094 602.936Mi/s -sha3_384/64_mean 205 ns 205 ns 10 943.255 8.42192 522.395Mi/s -sha3_384/64_median 202 ns 202 ns 10 943.619 8.42517 527.796Mi/s -sha3_384/64_stddev 3.85 ns 3.85 ns 10 2.36019 0.0210731 9.73153Mi/s -sha3_384/64_cv 1.88 % 1.88 % 10 0.25% 0.25% 1.86% -sha3_384/64_min 201 ns 201 ns 10 939.775 8.39085 506.249Mi/s -sha3_384/64_max 211 ns 211 ns 10 947.313 8.45815 531.712Mi/s -shake256/16384/64_mean 22849 ns 22848 ns 10 105k 6.38374 686.812Mi/s -shake256/16384/64_median 22653 ns 22652 ns 10 104.915k 6.37861 692.507Mi/s -shake256/16384/64_stddev 494 ns 494 ns 10 317.577 0.019308 14.6152Mi/s -shake256/16384/64_cv 2.16 % 2.16 % 10 0.30% 0.30% 2.13% -shake256/16384/64_min 22345 ns 22345 ns 10 104.705k 6.36585 658.002Mi/s -shake256/16384/64_max 23839 ns 23839 ns 10 105.582k 6.41916 701.993Mi/s -shake128/1024/64_mean 1372 ns 1372 ns 10 6.23241k 5.72832 756.408Mi/s -shake128/1024/64_median 1369 ns 1369 ns 10 6.23393k 5.72972 757.977Mi/s -shake128/1024/64_stddev 29.2 ns 29.2 ns 10 25.4898 0.0234281 16.116Mi/s -shake128/1024/64_cv 2.13 % 2.13 % 10 0.41% 0.41% 2.13% -shake128/1024/64_min 1329 ns 1329 ns 10 6.18661k 5.68622 734.011Mi/s -shake128/1024/64_max 1414 ns 1414 ns 10 6.28484k 5.77651 780.865Mi/s -sha3_256/4096_mean 5945 ns 5945 ns 10 27.0144k 6.54418 662.389Mi/s 
-sha3_256/4096_median 5938 ns 5938 ns 10 26.994k 6.53924 662.99Mi/s -sha3_256/4096_stddev 97.4 ns 97.4 ns 10 80.1626 0.0194192 10.923Mi/s -sha3_256/4096_cv 1.64 % 1.64 % 10 0.30% 0.30% 1.65% -sha3_256/4096_min 5755 ns 5755 ns 10 26.905k 6.51768 647.36Mi/s -sha3_256/4096_max 6082 ns 6081 ns 10 27.1688k 6.58159 684.015Mi/s -sha3_256/256_mean 402 ns 402 ns 10 1.86848k 6.48779 683.156Mi/s -sha3_256/256_median 400 ns 400 ns 10 1.86764k 6.48487 686.565Mi/s -sha3_256/256_stddev 4.91 ns 4.91 ns 10 4.08667 0.0141898 8.19903Mi/s -sha3_256/256_cv 1.22 % 1.22 % 10 0.22% 0.22% 1.20% -sha3_256/256_min 398 ns 398 ns 10 1.86333k 6.4699 662.983Mi/s -sha3_256/256_max 414 ns 414 ns 10 1.87568k 6.51278 690.525Mi/s -shake256/4096/64_mean 5906 ns 5905 ns 10 26.9611k 6.48104 672.022Mi/s -shake256/4096/64_median 5899 ns 5899 ns 10 26.9527k 6.479 672.564Mi/s -shake256/4096/64_stddev 107 ns 107 ns 10 52.6918 0.0126663 12.1185Mi/s -shake256/4096/64_cv 1.81 % 1.81 % 10 0.20% 0.20% 1.80% -shake256/4096/64_min 5751 ns 5751 ns 10 26.88k 6.46155 653.138Mi/s -shake256/4096/64_max 6075 ns 6074 ns 10 27.0706k 6.50736 689.89Mi/s +sha3_512/16384_mean 47078 ns 47073 ns 10 196.189k 11.9278 333.249Mi/s +sha3_512/16384_median 47123 ns 47120 ns 10 196.171k 11.9268 332.894Mi/s +sha3_512/16384_stddev 375 ns 376 ns 10 280.473 0.0170521 2.66302Mi/s +sha3_512/16384_cv 0.80 % 0.80 % 10 0.14% 0.14% 0.80% +sha3_512/16384_min 46537 ns 46536 ns 10 195.8k 11.9042 329.629Mi/s +sha3_512/16384_max 47587 ns 47587 ns 10 196.741k 11.9614 337.072Mi/s +shake256/1024/64_mean 1697 ns 1697 ns 10 7.06365k 6.49232 611.49Mi/s +shake256/1024/64_median 1694 ns 1694 ns 10 7.06479k 6.49337 612.661Mi/s +shake256/1024/64_stddev 22.2 ns 22.2 ns 10 6.53775 6.00896m 7.8507Mi/s +shake256/1024/64_cv 1.31 % 1.31 % 10 0.09% 0.09% 1.28% +shake256/1024/64_min 1670 ns 1670 ns 10 7.05263k 6.4822 591.563Mi/s +shake256/1024/64_max 1754 ns 1754 ns 10 7.0719k 6.49991 621.295Mi/s +sha3_384/1024_mean 2120 ns 2120 ns 10 8.77788k 8.18832 482.406Mi/s 
+sha3_384/1024_median 2108 ns 2108 ns 10 8.77528k 8.18589 485.01Mi/s +sha3_384/1024_stddev 46.4 ns 46.4 ns 10 16.016 0.0149403 10.3393Mi/s +sha3_384/1024_cv 2.19 % 2.19 % 10 0.18% 0.18% 2.14% +sha3_384/1024_min 2072 ns 2072 ns 10 8.7518k 8.16399 462.86Mi/s +sha3_384/1024_max 2209 ns 2209 ns 10 8.80363k 8.21235 493.522Mi/s +sha3_256/1024_mean 1696 ns 1696 ns 10 7.06628k 6.69155 593.884Mi/s +sha3_256/1024_median 1688 ns 1688 ns 10 7.06231k 6.68779 596.644Mi/s +sha3_256/1024_stddev 29.6 ns 29.6 ns 10 8.20204 7.76708m 10.0777Mi/s +sha3_256/1024_cv 1.74 % 1.74 % 10 0.12% 0.12% 1.70% +sha3_256/1024_min 1665 ns 1665 ns 10 7.05768k 6.68341 568.538Mi/s +sha3_256/1024_max 1771 ns 1771 ns 10 7.07858k 6.7032 605.012Mi/s +sha3_224/256_mean 450 ns 450 ns 10 1.88327k 6.63123 602.425Mi/s +sha3_224/256_median 449 ns 449 ns 10 1.88234k 6.62796 603.486Mi/s +sha3_224/256_stddev 5.76 ns 5.77 ns 10 4.11249 0.0144806 7.58021Mi/s +sha3_224/256_cv 1.28 % 1.28 % 10 0.22% 0.22% 1.26% +sha3_224/256_min 444 ns 444 ns 10 1.87915k 6.61674 583.642Mi/s +sha3_224/256_max 464 ns 464 ns 10 1.89181k 6.66131 610.465Mi/s +sha3_224/1024_mean 1705 ns 1705 ns 10 7.11719k 6.76539 588.434Mi/s +sha3_224/1024_median 1701 ns 1701 ns 10 7.11954k 6.76763 589.703Mi/s +sha3_224/1024_stddev 27.8 ns 27.8 ns 10 13.0981 0.0124506 9.39551Mi/s +sha3_224/1024_cv 1.63 % 1.63 % 10 0.18% 0.18% 1.60% +sha3_224/1024_min 1675 ns 1675 ns 10 7.09816k 6.7473 566.411Mi/s +sha3_224/1024_max 1771 ns 1771 ns 10 7.13747k 6.78467 598.933Mi/s +sha3_384/16384_mean 32770 ns 32768 ns 10 136.423k 8.30227 478.248Mi/s +sha3_384/16384_median 32754 ns 32754 ns 10 136.378k 8.29953 478.439Mi/s +sha3_384/16384_stddev 215 ns 216 ns 10 283.637 0.0172612 3.14624Mi/s +sha3_384/16384_cv 0.66 % 0.66 % 10 0.21% 0.21% 0.66% +sha3_384/16384_min 32385 ns 32382 ns 10 135.995k 8.27622 472.853Mi/s +sha3_384/16384_max 33142 ns 33141 ns 10 136.902k 8.33143 483.934Mi/s +shake256/16384/64_mean 25386 ns 25385 ns 10 104.586k 6.3586 618.132Mi/s 
+shake256/16384/64_median 25199 ns 25197 ns 10 104.61k 6.36005 622.527Mi/s +shake256/16384/64_stddev 497 ns 497 ns 10 126.374 7.68322m 11.8281Mi/s +shake256/16384/64_cv 1.96 % 1.96 % 10 0.12% 0.12% 1.91% +shake256/16384/64_min 24969 ns 24968 ns 10 104.365k 6.34518 595.478Mi/s +shake256/16384/64_max 26343 ns 26342 ns 10 104.774k 6.37003 628.257Mi/s +shake256/64/64_mean 226 ns 226 ns 10 943.79 7.37336 539.91Mi/s +shake256/64/64_median 225 ns 225 ns 10 943.411 7.3704 542.65Mi/s +shake256/64/64_stddev 3.86 ns 3.87 ns 10 2.08899 0.0163202 8.89311Mi/s +shake256/64/64_cv 1.71 % 1.71 % 10 0.22% 0.22% 1.65% +shake256/64/64_min 224 ns 224 ns 10 941.407 7.35474 515.542Mi/s +shake256/64/64_max 237 ns 237 ns 10 946.997 7.39841 546.137Mi/s +shake128/256/64_mean 451 ns 451 ns 10 1.87769k 5.86778 676.797Mi/s +shake128/256/64_median 449 ns 449 ns 10 1.87731k 5.8666 679.475Mi/s +shake128/256/64_stddev 6.62 ns 6.62 ns 10 6.2563 0.0195509 9.77397Mi/s +shake128/256/64_cv 1.47 % 1.47 % 10 0.33% 0.33% 1.44% +shake128/256/64_min 445 ns 445 ns 10 1.87026k 5.84456 657.905Mi/s +shake128/256/64_max 464 ns 464 ns 10 1.88865k 5.90202 686.472Mi/s +sha3_256/4096_mean 6501 ns 6500 ns 10 26.9396k 6.52608 605.728Mi/s +sha3_256/4096_median 6484 ns 6484 ns 10 26.9337k 6.52463 607.18Mi/s +sha3_256/4096_stddev 95.1 ns 95.1 ns 10 25.8319 6.25772m 8.60066Mi/s +sha3_256/4096_cv 1.46 % 1.46 % 10 0.10% 0.10% 1.42% +sha3_256/4096_min 6407 ns 6407 ns 10 26.9063k 6.51799 582.404Mi/s +sha3_256/4096_max 6760 ns 6760 ns 10 26.9973k 6.54005 614.465Mi/s +shake128/16384/64_mean 20473 ns 20472 ns 10 85.2039k 5.1802 766.287Mi/s +shake128/16384/64_median 20493 ns 20493 ns 10 85.2611k 5.18367 765.451Mi/s +shake128/16384/64_stddev 190 ns 190 ns 10 467.964 0.0284511 7.10767Mi/s +shake128/16384/64_cv 0.93 % 0.93 % 10 0.55% 0.55% 0.93% +shake128/16384/64_min 20201 ns 20201 ns 10 84.4673k 5.13542 754.681Mi/s +shake128/16384/64_max 20787 ns 20785 ns 10 85.7988k 5.21637 776.512Mi/s +sha3_384/64_mean 228 ns 228 ns 10 939.351 
8.38706 469.139Mi/s +sha3_384/64_median 226 ns 226 ns 10 947.031 8.45563 472.125Mi/s +sha3_384/64_stddev 4.31 ns 4.31 ns 10 24.7589 0.221062 8.77855Mi/s +sha3_384/64_cv 1.89 % 1.89 % 10 2.64% 2.64% 1.87% +sha3_384/64_min 223 ns 223 ns 10 869.022 7.75913 454.531Mi/s +sha3_384/64_max 235 ns 235 ns 10 950.115 8.48317 479.098Mi/s +shake128/64/64_mean 230 ns 230 ns 10 956.516 7.47278 531.434Mi/s +shake128/64/64_median 229 ns 229 ns 10 957.289 7.47882 533.223Mi/s +shake128/64/64_stddev 3.38 ns 3.38 ns 10 2.75085 0.021491 7.64794Mi/s +shake128/64/64_cv 1.47 % 1.47 % 10 0.29% 0.29% 1.44% +shake128/64/64_min 226 ns 226 ns 10 950.46 7.42547 512.269Mi/s +shake128/64/64_max 238 ns 238 ns 10 959.436 7.49559 539.297Mi/s +shake128/4096/64_mean 5271 ns 5271 ns 10 21.876k 5.25865 752.773Mi/s +shake128/4096/64_median 5262 ns 5262 ns 10 21.8532k 5.25317 753.976Mi/s +shake128/4096/64_stddev 57.5 ns 57.5 ns 10 123.224 0.0296211 8.09954Mi/s +shake128/4096/64_cv 1.09 % 1.09 % 10 0.56% 0.56% 1.08% +shake128/4096/64_min 5204 ns 5204 ns 10 21.6725k 5.20974 733.432Mi/s +shake128/4096/64_max 5409 ns 5409 ns 10 22.0734k 5.3061 762.394Mi/s +sha3_384/4096_mean 8345 ns 8344 ns 10 34.6399k 8.35905 473.729Mi/s +sha3_384/4096_median 8321 ns 8321 ns 10 34.6334k 8.35748 474.958Mi/s +sha3_384/4096_stddev 131 ns 131 ns 10 61.8703 0.0149301 7.29331Mi/s +sha3_384/4096_cv 1.57 % 1.57 % 10 0.18% 0.18% 1.54% +sha3_384/4096_min 8209 ns 8209 ns 10 34.5658k 8.34117 457.128Mi/s +sha3_384/4096_max 8646 ns 8645 ns 10 34.7457k 8.38459 481.45Mi/s +shake256/4096/64_mean 6567 ns 6566 ns 10 26.9033k 6.46715 604.542Mi/s +shake256/4096/64_median 6500 ns 6500 ns 10 26.9327k 6.4742 610.387Mi/s +shake256/4096/64_stddev 169 ns 169 ns 10 198.957 0.0478261 15.3932Mi/s +shake256/4096/64_cv 2.57 % 2.57 % 10 0.74% 0.74% 2.55% +shake256/4096/64_min 6331 ns 6331 ns 10 26.3655k 6.33785 582.197Mi/s +shake256/4096/64_max 6814 ns 6814 ns 10 27.0824k 6.51018 626.632Mi/s +sha3_224/16384_mean 23908 ns 23907 ns 10 99.2535k 6.04762 
654.753Mi/s +sha3_224/16384_median 23853 ns 23853 ns 10 99.1969k 6.04417 656.185Mi/s +sha3_224/16384_stddev 223 ns 223 ns 10 257.736 0.0157041 6.08101Mi/s +sha3_224/16384_cv 0.93 % 0.93 % 10 0.26% 0.26% 0.93% +sha3_224/16384_min 23642 ns 23642 ns 10 98.9093k 6.02664 643.085Mi/s +sha3_224/16384_max 24339 ns 24338 ns 10 99.6413k 6.07125 662.037Mi/s +sha3_512/256_mean 862 ns 862 ns 10 3.56823k 11.1507 354.154Mi/s +sha3_512/256_median 855 ns 855 ns 10 3.56642k 11.145 356.95Mi/s +sha3_512/256_stddev 18.2 ns 18.2 ns 10 6.50433 0.020326 7.34607Mi/s +sha3_512/256_cv 2.11 % 2.11 % 10 0.18% 0.18% 2.07% +sha3_512/256_min 843 ns 843 ns 10 3.5613k 11.1291 341.393Mi/s +sha3_512/256_max 894 ns 894 ns 10 3.58141k 11.1919 362.115Mi/s +shake256/256/64_mean 444 ns 444 ns 10 1.8628k 5.82126 688.092Mi/s +shake256/256/64_median 444 ns 444 ns 10 1.8645k 5.82655 687.267Mi/s +shake256/256/64_stddev 3.00 ns 3.00 ns 10 3.06097 9.56552m 4.66754Mi/s +shake256/256/64_cv 0.68 % 0.68 % 10 0.16% 0.16% 0.68% +shake256/256/64_min 438 ns 438 ns 10 1.85831k 5.80723 680.75Mi/s +shake256/256/64_max 448 ns 448 ns 10 1.86634k 5.8323 696.864Mi/s +sha3_512/1024_mean 3163 ns 3163 ns 10 13.0816k 12.0235 328.133Mi/s +sha3_512/1024_median 3150 ns 3150 ns 10 13.0817k 12.0236 329.422Mi/s +sha3_512/1024_stddev 42.8 ns 42.8 ns 10 23.3898 0.021498 4.32226Mi/s +sha3_512/1024_cv 1.35 % 1.35 % 10 0.18% 0.18% 1.32% +sha3_512/1024_min 3132 ns 3132 ns 10 13.0397k 11.985 316.492Mi/s +sha3_512/1024_max 3279 ns 3278 ns 10 13.1255k 12.0639 331.332Mi/s +sha3_384/256_mean 647 ns 647 ns 10 2.69961k 8.88029 448.373Mi/s +sha3_384/256_median 645 ns 644 ns 10 2.69926k 8.87914 449.836Mi/s +sha3_384/256_stddev 6.36 ns 6.37 ns 10 3.21142 0.0105639 4.39387Mi/s +sha3_384/256_cv 0.98 % 0.98 % 10 0.12% 0.12% 0.98% +sha3_384/256_min 639 ns 639 ns 10 2.69471k 8.86419 440.796Mi/s +sha3_384/256_max 658 ns 658 ns 10 2.70448k 8.89632 453.811Mi/s +sha3_512/64_mean 226 ns 226 ns 10 936.855 7.31918 540.182Mi/s +sha3_512/64_median 225 ns 225 ns 10 
940.248 7.34569 542.557Mi/s +sha3_512/64_stddev 2.98 ns 2.98 ns 10 12.7421 0.0995479 6.98803Mi/s +sha3_512/64_cv 1.32 % 1.32 % 10 1.36% 1.36% 1.29% +sha3_512/64_min 223 ns 223 ns 10 901.144 7.04019 522.984Mi/s +sha3_512/64_max 233 ns 233 ns 10 944.174 7.37636 546.214Mi/s +sha3_224/64_mean 229 ns 229 ns 10 956.933 10.4014 383.458Mi/s +sha3_224/64_median 228 ns 228 ns 10 956.708 10.399 385.106Mi/s +sha3_224/64_stddev 3.52 ns 3.52 ns 10 3.9546 0.0429848 5.82002Mi/s +sha3_224/64_cv 1.54 % 1.54 % 10 0.41% 0.41% 1.52% +sha3_224/64_min 225 ns 225 ns 10 951.42 10.3415 372.426Mi/s +sha3_224/64_max 236 ns 236 ns 10 963.04 10.4678 390.144Mi/s +sha3_224/4096_mean 6097 ns 6097 ns 10 25.3631k 6.15011 645.17Mi/s +sha3_224/4096_median 6087 ns 6086 ns 10 25.3494k 6.1468 646.198Mi/s +sha3_224/4096_stddev 62.2 ns 62.1 ns 10 76.6335 0.0185823 6.56813Mi/s +sha3_224/4096_cv 1.02 % 1.02 % 10 0.30% 0.30% 1.02% +sha3_224/4096_min 5994 ns 5994 ns 10 25.2631k 6.12588 634.169Mi/s +sha3_224/4096_max 6202 ns 6202 ns 10 25.5052k 6.18458 656.197Mi/s +keccak-p[1600, 24]_mean 201 ns 201 ns 10 836.028 4.18014 947.066Mi/s +keccak-p[1600, 24]_median 202 ns 202 ns 10 835.852 4.17926 944.329Mi/s +keccak-p[1600, 24]_stddev 1.56 ns 1.56 ns 10 0.953858 4.76929m 7.36137Mi/s +keccak-p[1600, 24]_cv 0.77 % 0.77 % 10 0.11% 0.11% 0.78% +keccak-p[1600, 24]_min 198 ns 198 ns 10 834.91 4.17455 936.229Mi/s +keccak-p[1600, 24]_max 204 ns 204 ns 10 837.704 4.18852 962.401Mi/s +sha3_256/16384_mean 25310 ns 25309 ns 10 104.779k 6.38275 618.77Mi/s +sha3_256/16384_median 25196 ns 25194 ns 10 104.793k 6.38362 621.404Mi/s +sha3_256/16384_stddev 476 ns 476 ns 10 209.428 0.0127576 11.4012Mi/s +sha3_256/16384_cv 1.88 % 1.88 % 10 0.20% 0.20% 1.84% +sha3_256/16384_min 24771 ns 24770 ns 10 104.476k 6.36426 592.49Mi/s +sha3_256/16384_max 26424 ns 26423 ns 10 105.07k 6.40045 632.034Mi/s +shake128/1024/64_mean 1511 ns 1511 ns 10 6.24248k 5.73758 686.751Mi/s +shake128/1024/64_median 1504 ns 1504 ns 10 6.23871k 5.73411 689.881Mi/s 
+shake128/1024/64_stddev 21.5 ns 21.5 ns 10 30.5501 0.0280791 9.64082Mi/s +shake128/1024/64_cv 1.42 % 1.42 % 10 0.49% 0.49% 1.40% +shake128/1024/64_min 1485 ns 1485 ns 10 6.20875k 5.70657 666.979Mi/s +shake128/1024/64_max 1556 ns 1556 ns 10 6.31645k 5.80556 698.568Mi/s +sha3_256/64_mean 227 ns 227 ns 10 945.557 9.84955 403.273Mi/s +sha3_256/64_median 227 ns 227 ns 10 944.72 9.84083 403.878Mi/s +sha3_256/64_stddev 2.80 ns 2.80 ns 10 3.757 0.0391354 4.8898Mi/s +sha3_256/64_cv 1.23 % 1.23 % 10 0.40% 0.40% 1.21% +sha3_256/64_min 224 ns 224 ns 10 939.278 9.78415 391.182Mi/s +sha3_256/64_max 234 ns 234 ns 10 950.283 9.89879 408.808Mi/s +sha3_512/4096_mean 11925 ns 11924 ns 10 49.2576k 11.8408 332.756Mi/s +sha3_512/4096_median 11876 ns 11875 ns 10 49.2575k 11.8408 334.075Mi/s +sha3_512/4096_stddev 159 ns 158 ns 10 106.902 0.0256976 4.34838Mi/s +sha3_512/4096_cv 1.33 % 1.33 % 10 0.22% 0.22% 1.31% +sha3_512/4096_min 11766 ns 11766 ns 10 49.099k 11.8027 322.616Mi/s +sha3_512/4096_max 12298 ns 12297 ns 10 49.4537k 11.8879 337.189Mi/s +sha3_256/256_mean 448 ns 448 ns 10 1.8696k 6.49166 613.655Mi/s +sha3_256/256_median 447 ns 447 ns 10 1.8684k 6.48751 614.217Mi/s +sha3_256/256_stddev 4.74 ns 4.74 ns 10 4.80987 0.016701 6.48242Mi/s +sha3_256/256_cv 1.06 % 1.06 % 10 0.26% 0.26% 1.06% +sha3_256/256_min 441 ns 441 ns 10 1.86248k 6.46695 602.375Mi/s +sha3_256/256_max 456 ns 456 ns 10 1.8775k 6.51911 622.879Mi/s +``` + +### On Apple M1 Max ( compiled with `Apple clang version 15.0.0 (clang-1500.1.0.2.5)` ) + +```bash +2024-01-20T16:13:19+04:00 +Running ./build/benchmarks/bench.out +Run on (10 X 24 MHz CPU s) +CPU Caches: + L1 Data 64 KiB + L1 Instruction 128 KiB + L2 Unified 4096 KiB (x10) +Load Average: 2.73, 3.26, 3.72 +------------------------------------------------------------------------------------- +Benchmark Time CPU Iterations bytes_per_second +------------------------------------------------------------------------------------- +sha3_384/256_mean 734 ns 732 ns 10 
395.832Mi/s +sha3_384/256_median 731 ns 730 ns 10 397.168Mi/s +sha3_384/256_stddev 8.10 ns 7.83 ns 10 4.12066Mi/s +sha3_384/256_cv 1.10 % 1.07 % 10 1.04% +sha3_384/256_min 730 ns 730 ns 10 384.11Mi/s +sha3_384/256_max 757 ns 755 ns 10 397.216Mi/s +sha3_256/256_mean 517 ns 515 ns 10 533.919Mi/s +sha3_256/256_median 513 ns 512 ns 10 536.355Mi/s +sha3_256/256_stddev 12.6 ns 7.65 ns 10 7.65515Mi/s +sha3_256/256_cv 2.44 % 1.49 % 10 1.43% +sha3_256/256_min 512 ns 512 ns 10 512.133Mi/s +sha3_256/256_max 553 ns 536 ns 10 536.405Mi/s +sha3_224/256_mean 517 ns 517 ns 10 524.214Mi/s +sha3_224/256_median 517 ns 517 ns 10 524.326Mi/s +sha3_224/256_stddev 0.324 ns 0.345 ns 10 357.38Ki/s +sha3_224/256_cv 0.06 % 0.07 % 10 0.07% +sha3_224/256_min 517 ns 517 ns 10 523.222Mi/s +sha3_224/256_max 518 ns 518 ns 10 524.347Mi/s +sha3_256/16384_mean 29774 ns 29740 ns 10 526.42Mi/s +sha3_256/16384_median 29776 ns 29739 ns 10 526.432Mi/s +sha3_256/16384_stddev 7.40 ns 4.47 ns 10 81.0949Ki/s +sha3_256/16384_cv 0.02 % 0.02 % 10 0.02% +sha3_256/16384_min 29763 ns 29734 ns 10 526.301Mi/s +sha3_256/16384_max 29784 ns 29746 ns 10 526.526Mi/s +shake128/256/64_mean 531 ns 530 ns 10 575.569Mi/s +shake128/256/64_median 530 ns 529 ns 10 576.448Mi/s +shake128/256/64_stddev 2.58 ns 2.61 ns 10 2.79422Mi/s +shake128/256/64_cv 0.49 % 0.49 % 10 0.49% +shake128/256/64_min 530 ns 529 ns 10 567.616Mi/s +shake128/256/64_max 538 ns 538 ns 10 576.479Mi/s +shake256/4096/64_mean 7647 ns 7637 ns 10 519.457Mi/s +shake256/4096/64_median 7647 ns 7637 ns 10 519.456Mi/s +shake256/4096/64_stddev 2.35 ns 0.404 ns 10 28.1501Ki/s +shake256/4096/64_cv 0.03 % 0.01 % 10 0.01% +shake256/4096/64_min 7645 ns 7637 ns 10 519.393Mi/s +shake256/4096/64_max 7652 ns 7638 ns 10 519.485Mi/s +sha3_512/64_mean 253 ns 253 ns 10 483.055Mi/s +sha3_512/64_median 252 ns 251 ns 10 485.423Mi/s +sha3_512/64_stddev 4.10 ns 4.08 ns 10 7.49395Mi/s +sha3_512/64_cv 1.62 % 1.61 % 10 1.55% +sha3_512/64_min 252 ns 251 ns 10 461.727Mi/s +sha3_512/64_max 265 
ns 264 ns 10 485.477Mi/s +shake256/64/64_mean 271 ns 271 ns 10 450.256Mi/s +shake256/64/64_median 271 ns 271 ns 10 451.014Mi/s +shake256/64/64_stddev 1.55 ns 1.49 ns 10 2.44808Mi/s +shake256/64/64_cv 0.57 % 0.55 % 10 0.54% +shake256/64/64_min 271 ns 271 ns 10 443.29Mi/s +shake256/64/64_max 276 ns 275 ns 10 451.16Mi/s +shake128/1024/64_mean 1816 ns 1803 ns 10 575.609Mi/s +shake128/1024/64_median 1800 ns 1798 ns 10 577.014Mi/s +shake128/1024/64_stddev 46.9 ns 12.3 ns 10 3.8613Mi/s +shake128/1024/64_cv 2.58 % 0.68 % 10 0.67% +shake128/1024/64_min 1800 ns 1798 ns 10 564.735Mi/s +shake128/1024/64_max 1949 ns 1837 ns 10 577.148Mi/s +sha3_224/1024_mean 2020 ns 2018 ns 10 497.314Mi/s +sha3_224/1024_median 2012 ns 2010 ns 10 499.175Mi/s +sha3_224/1024_stddev 24.7 ns 24.6 ns 10 5.88352Mi/s +sha3_224/1024_cv 1.22 % 1.22 % 10 1.18% +sha3_224/1024_min 2012 ns 2010 ns 10 480.569Mi/s +sha3_224/1024_max 2091 ns 2088 ns 10 499.218Mi/s +sha3_384/16384_mean 37832 ns 37784 ns 10 414.745Mi/s +sha3_384/16384_median 37825 ns 37781 ns 10 414.775Mi/s +sha3_384/16384_stddev 17.4 ns 9.89 ns 10 111.108Ki/s +sha3_384/16384_cv 0.05 % 0.03 % 10 0.03% +sha3_384/16384_min 37816 ns 37779 ns 10 414.441Mi/s +sha3_384/16384_max 37874 ns 37812 ns 10 414.805Mi/s +sha3_224/16384_mean 28393 ns 28349 ns 10 552.113Mi/s +sha3_224/16384_median 28387 ns 28349 ns 10 552.117Mi/s +sha3_224/16384_stddev 22.8 ns 1.44 ns 10 28.6494Ki/s +sha3_224/16384_cv 0.08 % 0.01 % 10 0.01% +sha3_224/16384_min 28377 ns 28347 ns 10 552.07Mi/s +sha3_224/16384_max 28453 ns 28351 ns 10 552.145Mi/s +sha3_512/4096_mean 12512 ns 12497 ns 10 317.471Mi/s +sha3_512/4096_median 12510 ns 12494 ns 10 317.538Mi/s +sha3_512/4096_stddev 7.05 ns 6.11 ns 10 158.695Ki/s +sha3_512/4096_cv 0.06 % 0.05 % 10 0.05% +sha3_512/4096_min 12507 ns 12493 ns 10 317.092Mi/s +sha3_512/4096_max 12531 ns 12511 ns 10 317.561Mi/s +sha3_512/16384_mean 50271 ns 50202 ns 10 312.515Mi/s +sha3_512/16384_median 49954 ns 49894 ns 10 314.385Mi/s +sha3_512/16384_stddev 732 
ns 710 ns 10 4.30358Mi/s +sha3_512/16384_cv 1.46 % 1.41 % 10 1.38% +sha3_512/16384_min 49933 ns 49877 ns 10 301.568Mi/s +sha3_512/16384_max 52161 ns 52015 ns 10 314.494Mi/s +keccak-p[1600, 24]_mean 210 ns 209 ns 10 910.792Mi/s +keccak-p[1600, 24]_median 210 ns 209 ns 10 910.799Mi/s +keccak-p[1600, 24]_stddev 0.061 ns 0.014 ns 10 64.3826Ki/s +keccak-p[1600, 24]_cv 0.03 % 0.01 % 10 0.01% +keccak-p[1600, 24]_min 210 ns 209 ns 10 910.691Mi/s +keccak-p[1600, 24]_max 210 ns 209 ns 10 910.854Mi/s +sha3_384/4096_mean 9591 ns 9579 ns 10 412.574Mi/s +sha3_384/4096_median 9590 ns 9578 ns 10 412.594Mi/s +sha3_384/4096_stddev 3.48 ns 1.56 ns 10 68.6457Ki/s +sha3_384/4096_cv 0.04 % 0.02 % 10 0.02% +sha3_384/4096_min 9587 ns 9577 ns 10 412.459Mi/s +sha3_384/4096_max 9597 ns 9582 ns 10 412.639Mi/s +shake128/64/64_mean 278 ns 278 ns 10 438.941Mi/s +shake128/64/64_median 278 ns 278 ns 10 438.957Mi/s +shake128/64/64_stddev 0.152 ns 0.032 ns 10 51.5279Ki/s +shake128/64/64_cv 0.05 % 0.01 % 10 0.01% +shake128/64/64_min 278 ns 278 ns 10 438.871Mi/s +shake128/64/64_max 279 ns 278 ns 10 439.012Mi/s +sha3_256/1024_mean 1995 ns 1992 ns 10 505.488Mi/s +sha3_256/1024_median 1992 ns 1989 ns 10 506.203Mi/s +sha3_256/1024_stddev 8.81 ns 8.81 ns 10 2.20981Mi/s +sha3_256/1024_cv 0.44 % 0.44 % 10 0.44% +sha3_256/1024_min 1990 ns 1989 ns 10 499.201Mi/s +sha3_256/1024_max 2020 ns 2017 ns 10 506.235Mi/s +sha3_256/4096_mean 7659 ns 7651 ns 10 514.586Mi/s +sha3_256/4096_median 7644 ns 7635 ns 10 515.624Mi/s +sha3_256/4096_stddev 42.9 ns 40.6 ns 10 2.69423Mi/s +sha3_256/4096_cv 0.56 % 0.53 % 10 0.52% +sha3_256/4096_min 7641 ns 7634 ns 10 507.011Mi/s +sha3_256/4096_max 7780 ns 7765 ns 10 515.672Mi/s +sha3_256/64_mean 271 ns 271 ns 10 338.262Mi/s +sha3_256/64_median 271 ns 271 ns 10 338.262Mi/s +sha3_256/64_stddev 0.098 ns 0.030 ns 10 37.8028Ki/s +sha3_256/64_cv 0.04 % 0.01 % 10 0.01% +sha3_256/64_min 271 ns 271 ns 10 338.205Mi/s +sha3_256/64_max 271 ns 271 ns 10 338.336Mi/s +sha3_512/256_mean 925 ns 923 ns 
10 330.489Mi/s +sha3_512/256_median 925 ns 923 ns 10 330.494Mi/s +sha3_512/256_stddev 0.613 ns 0.060 ns 10 22.1389Ki/s +sha3_512/256_cv 0.07 % 0.01 % 10 0.01% +sha3_512/256_min 924 ns 923 ns 10 330.451Mi/s +sha3_512/256_max 926 ns 924 ns 10 330.522Mi/s +sha3_224/4096_mean 7273 ns 7263 ns 10 541.606Mi/s +sha3_224/4096_median 7244 ns 7234 ns 10 543.707Mi/s +sha3_224/4096_stddev 89.4 ns 90.2 ns 10 6.52127Mi/s +sha3_224/4096_cv 1.23 % 1.24 % 10 1.20% +sha3_224/4096_min 7241 ns 7233 ns 10 523.05Mi/s +sha3_224/4096_max 7527 ns 7519 ns 10 543.734Mi/s +sha3_224/64_mean 272 ns 271 ns 10 323.182Mi/s +sha3_224/64_median 272 ns 271 ns 10 323.247Mi/s +sha3_224/64_stddev 0.194 ns 0.157 ns 10 191.57Ki/s +sha3_224/64_cv 0.07 % 0.06 % 10 0.06% +sha3_224/64_min 272 ns 271 ns 10 322.66Mi/s +sha3_224/64_max 272 ns 272 ns 10 323.285Mi/s +sha3_384/64_mean 252 ns 251 ns 10 425.074Mi/s +sha3_384/64_median 252 ns 251 ns 10 425.092Mi/s +sha3_384/64_stddev 0.071 ns 0.028 ns 10 47.8782Ki/s +sha3_384/64_cv 0.03 % 0.01 % 10 0.01% +sha3_384/64_min 251 ns 251 ns 10 424.952Mi/s +sha3_384/64_max 252 ns 251 ns 10 425.113Mi/s +shake128/16384/64_mean 24833 ns 24796 ns 10 632.6Mi/s +shake128/16384/64_median 24829 ns 24795 ns 10 632.626Mi/s +shake128/16384/64_stddev 15.4 ns 3.05 ns 10 79.7538Ki/s +shake128/16384/64_cv 0.06 % 0.01 % 10 0.01% +shake128/16384/64_min 24817 ns 24793 ns 10 632.438Mi/s +shake128/16384/64_max 24866 ns 24802 ns 10 632.673Mi/s +shake256/256/64_mean 513 ns 513 ns 10 595.316Mi/s +shake256/256/64_median 513 ns 513 ns 10 595.332Mi/s +shake256/256/64_stddev 0.163 ns 0.034 ns 10 40.9232Ki/s +shake256/256/64_cv 0.03 % 0.01 % 10 0.01% +shake256/256/64_min 513 ns 513 ns 10 595.222Mi/s +shake256/256/64_max 514 ns 513 ns 10 595.354Mi/s +shake256/16384/64_mean 29791 ns 29754 ns 10 527.191Mi/s +shake256/16384/64_median 29786 ns 29752 ns 10 527.232Mi/s +shake256/16384/64_stddev 15.4 ns 5.76 ns 10 104.489Ki/s +shake256/16384/64_cv 0.05 % 0.02 % 10 0.02% +shake256/16384/64_min 29778 ns 29748 ns 
10 526.944Mi/s +shake256/16384/64_max 29830 ns 29768 ns 10 527.297Mi/s +sha3_384/1024_mean 2407 ns 2404 ns 10 425.212Mi/s +sha3_384/1024_median 2407 ns 2404 ns 10 425.212Mi/s +sha3_384/1024_stddev 0.667 ns 0.064 ns 10 11.5857Ki/s +sha3_384/1024_cv 0.03 % 0.00 % 10 0.00% +sha3_384/1024_min 2406 ns 2404 ns 10 425.197Mi/s +sha3_384/1024_max 2409 ns 2404 ns 10 425.227Mi/s +shake256/1024/64_mean 1993 ns 1990 ns 10 521.301Mi/s +shake256/1024/64_median 1993 ns 1990 ns 10 521.31Mi/s +shake256/1024/64_stddev 1.56 ns 0.101 ns 10 27.1673Ki/s +shake256/1024/64_cv 0.08 % 0.01 % 10 0.01% +shake256/1024/64_min 1992 ns 1990 ns 10 521.241Mi/s +shake256/1024/64_max 1997 ns 1991 ns 10 521.33Mi/s +sha3_512/1024_mean 3332 ns 3328 ns 10 311.783Mi/s +sha3_512/1024_median 3332 ns 3328 ns 10 311.77Mi/s +sha3_512/1024_stddev 1.02 ns 0.431 ns 10 41.3057Ki/s +sha3_512/1024_cv 0.03 % 0.01 % 10 0.01% +sha3_512/1024_min 3331 ns 3327 ns 10 311.746Mi/s +sha3_512/1024_max 3334 ns 3328 ns 10 311.858Mi/s +shake128/4096/64_mean 6361 ns 6352 ns 10 624.596Mi/s +shake128/4096/64_median 6360 ns 6351 ns 10 624.689Mi/s +shake128/4096/64_stddev 4.09 ns 1.91 ns 10 192.71Ki/s +shake128/4096/64_cv 0.06 % 0.03 % 10 0.03% +shake128/4096/64_min 6356 ns 6350 ns 10 624.315Mi/s +shake128/4096/64_max 6369 ns 6355 ns 10 624.792Mi/s ``` -### On ARM Cortex-A72 i.e. Raspberry Pi 4B ( compiled with Clang-16.0.6 ) +### On ARM Cortex-A72 i.e. 
Raspberry Pi 4B ( compiled with `gcc version 13.2.0 (Ubuntu 13.2.0-4ubuntu3)` ) ```bash -2023-12-22T22:28:29+05:30 +2024-01-20T16:20:37+04:00 Running ./build/perfs/perf.out Run on (4 X 1800 MHz CPU s) CPU Caches: L1 Data 32 KiB (x4) L1 Instruction 48 KiB (x4) L2 Unified 1024 KiB (x1) -Load Average: 2.66, 3.10, 2.04 +Load Average: 0.59, 0.74, 1.06 ------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations CYCLES CYCLES/ BYTE bytes_per_second ------------------------------------------------------------------------------------------------------------- -sha3_384/256_mean 4943 ns 4941 ns 10 8.88107k 29.214 58.6747Mi/s -sha3_384/256_median 4943 ns 4941 ns 10 8.88101k 29.2138 58.6736Mi/s -sha3_384/256_stddev 1.08 ns 0.599 ns 10 0.37254 1.22546m 7.28721Ki/s +sha3_384/256_mean 2809 ns 2809 ns 10 5.04865k 16.6074 103.22Mi/s +sha3_384/256_median 2809 ns 2809 ns 10 5.04856k 16.6071 103.221Mi/s +sha3_384/256_stddev 0.443 ns 0.294 ns 10 0.230906 759.558u 11.0744Ki/s sha3_384/256_cv 0.02 % 0.01 % 10 0.00% 0.00% 0.01% -sha3_384/256_min 4941 ns 4940 ns 10 8.88067k 29.2127 58.666Mi/s -sha3_384/256_max 4944 ns 4942 ns 10 8.88169k 29.2161 58.686Mi/s -sha3_224/1024_mean 13233 ns 13229 ns 10 23.7753k 22.6001 75.84Mi/s -sha3_224/1024_median 13233 ns 13229 ns 10 23.774k 22.5989 75.8375Mi/s -sha3_224/1024_stddev 5.09 ns 4.63 ns 10 6.07897 5.77849m 27.1864Ki/s -sha3_224/1024_cv 0.04 % 0.04 % 10 0.03% 0.03% 0.04% -sha3_224/1024_min 13224 ns 13221 ns 10 23.7677k 22.5929 75.7993Mi/s -sha3_224/1024_max 13241 ns 13236 ns 10 23.7864k 22.6106 75.8855Mi/s -shake128/256/64_mean 3553 ns 3552 ns 10 6.38394k 19.9498 85.9233Mi/s -shake128/256/64_median 3553 ns 3552 ns 10 6.38396k 19.9499 85.9209Mi/s -shake128/256/64_stddev 0.927 ns 0.618 ns 10 0.327318 1.02287m 15.3177Ki/s -shake128/256/64_cv 0.03 % 0.02 % 10 0.01% 0.01% 0.02% -shake128/256/64_min 3552 ns 3551 ns 10 6.38342k 19.9482 85.8936Mi/s -shake128/256/64_max 3555 ns 
3553 ns 10 6.38454k 19.9517 85.942Mi/s -sha3_256/256_mean 3496 ns 3495 ns 10 6.28129k 21.81 78.5915Mi/s -sha3_256/256_median 3496 ns 3495 ns 10 6.28122k 21.8098 78.5964Mi/s -sha3_256/256_stddev 1.65 ns 1.07 ns 10 0.237549 824.824u 24.5521Ki/s -sha3_256/256_cv 0.05 % 0.03 % 10 0.00% 0.00% 0.03% -sha3_256/256_min 3495 ns 3494 ns 10 6.28105k 21.8092 78.5259Mi/s -sha3_256/256_max 3500 ns 3498 ns 10 6.28184k 21.8119 78.6115Mi/s -sha3_512/16384_mean 367366 ns 367240 ns 10 659.951k 40.1235 42.7134Mi/s -sha3_512/16384_median 367342 ns 367217 ns 10 659.873k 40.1187 42.716Mi/s -sha3_512/16384_stddev 300 ns 279 ns 10 471.478 0.0286648 33.279Ki/s -sha3_512/16384_cv 0.08 % 0.08 % 10 0.07% 0.07% 0.08% -sha3_512/16384_min 366945 ns 366874 ns 10 659.364k 40.0878 42.674Mi/s -sha3_512/16384_max 367765 ns 367578 ns 10 660.686k 40.1682 42.7559Mi/s -shake256/256/64_mean 3501 ns 3500 ns 10 6.29015k 19.6567 87.2001Mi/s -shake256/256/64_median 3501 ns 3500 ns 10 6.29016k 19.6567 87.1962Mi/s -shake256/256/64_stddev 0.743 ns 0.490 ns 10 0.228355 713.608u 12.5052Ki/s -shake256/256/64_cv 0.02 % 0.01 % 10 0.00% 0.00% 0.01% -shake256/256/64_min 3500 ns 3499 ns 10 6.28986k 19.6558 87.1844Mi/s -shake256/256/64_max 3502 ns 3500 ns 10 6.29056k 19.658 87.2211Mi/s -sha3_512/256_mean 6548 ns 6546 ns 10 11.7647k 36.7648 46.6206Mi/s -sha3_512/256_median 6548 ns 6547 ns 10 11.7674k 36.7731 46.6163Mi/s -sha3_512/256_stddev 3.88 ns 3.93 ns 10 6.45845 0.0201827 28.6518Ki/s -sha3_512/256_cv 0.06 % 0.06 % 10 0.05% 0.05% 0.06% -sha3_512/256_min 6543 ns 6541 ns 10 11.7555k 36.736 46.5696Mi/s -sha3_512/256_max 6556 ns 6553 ns 10 11.7721k 36.7877 46.6592Mi/s -sha3_384/64_mean 1713 ns 1713 ns 10 3.07815k 27.4835 62.3696Mi/s -sha3_384/64_median 1713 ns 1713 ns 10 3.0782k 27.4839 62.3714Mi/s -sha3_384/64_stddev 0.979 ns 0.880 ns 10 1.45719 0.0130106 32.8174Ki/s -sha3_384/64_cv 0.06 % 0.05 % 10 0.05% 0.05% 0.05% -sha3_384/64_min 1712 ns 1711 ns 10 3.07641k 27.468 62.324Mi/s -sha3_384/64_max 1714 ns 1714 ns 10 3.0801k 
27.5009 62.4126Mi/s -sha3_384/16384_mean 255253 ns 255187 ns 10 458.662k 27.9127 61.409Mi/s -sha3_384/16384_median 255323 ns 255260 ns 10 458.788k 27.9204 61.3915Mi/s -sha3_384/16384_stddev 178 ns 177 ns 10 329.243 0.0200367 43.5852Ki/s -sha3_384/16384_cv 0.07 % 0.07 % 10 0.07% 0.07% 0.07% -sha3_384/16384_min 254996 ns 254914 ns 10 458.152k 27.8817 61.364Mi/s -sha3_384/16384_max 255452 ns 255374 ns 10 458.957k 27.9307 61.4746Mi/s -sha3_256/64_mean 1872 ns 1871 ns 10 3.36344k 35.0358 48.9256Mi/s -sha3_256/64_median 1872 ns 1871 ns 10 3.36339k 35.0354 48.9247Mi/s -sha3_256/64_stddev 0.255 ns 0.155 ns 10 0.157224 1.63775m 4.15276Ki/s -sha3_256/64_cv 0.01 % 0.01 % 10 0.00% 0.00% 0.01% -sha3_256/64_min 1871 ns 1871 ns 10 3.36325k 35.0339 48.9202Mi/s -sha3_256/64_max 1872 ns 1871 ns 10 3.36375k 35.039 48.9341Mi/s -shake256/1024/64_mean 13212 ns 13209 ns 10 23.7416k 21.8213 78.5551Mi/s -shake256/1024/64_median 13212 ns 13208 ns 10 23.7398k 21.8197 78.5571Mi/s -shake256/1024/64_stddev 2.97 ns 2.57 ns 10 3.89029 3.57563m 15.654Ki/s -shake256/1024/64_cv 0.02 % 0.02 % 10 0.02% 0.02% 0.02% -shake256/1024/64_min 13208 ns 13205 ns 10 23.7381k 21.8181 78.5244Mi/s -shake256/1024/64_max 13217 ns 13214 ns 10 23.7489k 21.828 78.5745Mi/s -sha3_512/64_mean 1712 ns 1711 ns 10 3.07627k 24.0333 71.3246Mi/s -sha3_512/64_median 1712 ns 1711 ns 10 3.07618k 24.0327 71.3253Mi/s -sha3_512/64_stddev 0.572 ns 0.380 ns 10 0.367503 2.87112m 16.2209Ki/s -sha3_512/64_cv 0.03 % 0.02 % 10 0.01% 0.01% 0.02% -sha3_512/64_min 1711 ns 1711 ns 10 3.0756k 24.0281 71.3013Mi/s -sha3_512/64_max 1713 ns 1712 ns 10 3.07693k 24.0385 71.3504Mi/s -shake256/64/64_mean 1878 ns 1877 ns 10 3.37372k 26.3572 65.0314Mi/s -shake256/64/64_median 1878 ns 1877 ns 10 3.37375k 26.3574 65.0351Mi/s -shake256/64/64_stddev 0.837 ns 0.588 ns 10 0.239178 1.86858m 20.8361Ki/s -shake256/64/64_cv 0.04 % 0.03 % 10 0.01% 0.01% 0.03% -shake256/64/64_min 1877 ns 1877 ns 10 3.3734k 26.3547 64.9787Mi/s -shake256/64/64_max 1880 ns 1879 ns 10 
3.37421k 26.361 65.0501Mi/s -shake128/64/64_mean 1926 ns 1925 ns 10 3.46032k 27.0337 63.4077Mi/s -shake128/64/64_median 1926 ns 1925 ns 10 3.46036k 27.0341 63.4059Mi/s -shake128/64/64_stddev 0.745 ns 0.558 ns 10 0.61368 4.79438m 18.8164Ki/s -shake128/64/64_cv 0.04 % 0.03 % 10 0.02% 0.02% 0.03% -shake128/64/64_min 1925 ns 1924 ns 10 3.45934k 27.0261 63.3744Mi/s -shake128/64/64_max 1927 ns 1926 ns 10 3.46119k 27.0406 63.4346Mi/s -sha3_224/256_mean 3509 ns 3508 ns 10 6.30456k 22.1991 77.216Mi/s -sha3_224/256_median 3508 ns 3507 ns 10 6.30454k 22.1991 77.2192Mi/s -sha3_224/256_stddev 0.834 ns 0.583 ns 10 0.182439 642.39u 13.1501Ki/s -sha3_224/256_cv 0.02 % 0.02 % 10 0.00% 0.00% 0.02% -sha3_224/256_min 3508 ns 3507 ns 10 6.3043k 22.1982 77.1953Mi/s -sha3_224/256_max 3510 ns 3509 ns 10 6.30483k 22.2001 77.2309Mi/s -sha3_256/1024_mean 13216 ns 13212 ns 10 23.7456k 22.4864 76.2252Mi/s -sha3_256/1024_median 13215 ns 13211 ns 10 23.7457k 22.4864 76.2324Mi/s -sha3_256/1024_stddev 7.18 ns 7.23 ns 10 13.2008 0.0125007 42.7151Ki/s -sha3_256/1024_cv 0.05 % 0.05 % 10 0.06% 0.06% 0.05% -sha3_256/1024_min 13206 ns 13202 ns 10 23.731k 22.4725 76.1697Mi/s -sha3_256/1024_max 13226 ns 13222 ns 10 23.76k 22.5 76.2834Mi/s -sha3_224/64_mean 1885 ns 1884 ns 10 3.38691k 36.8142 46.5611Mi/s -sha3_224/64_median 1885 ns 1884 ns 10 3.38676k 36.8126 46.561Mi/s -sha3_224/64_stddev 0.402 ns 0.312 ns 10 0.431011 4.68491m 7.88678Ki/s -sha3_224/64_cv 0.02 % 0.02 % 10 0.01% 0.01% 0.02% -sha3_224/64_min 1884 ns 1884 ns 10 3.38652k 36.81 46.5443Mi/s -sha3_224/64_max 1886 ns 1885 ns 10 3.38806k 36.8268 46.5747Mi/s -shake256/16384/64_mean 196275 ns 196221 ns 10 352.67k 21.4415 79.9407Mi/s -shake256/16384/64_median 196279 ns 196219 ns 10 352.626k 21.4388 79.9415Mi/s -shake256/16384/64_stddev 151 ns 149 ns 10 274.278 0.0166754 62.2572Ki/s -shake256/16384/64_cv 0.08 % 0.08 % 10 0.08% 0.08% 0.08% -shake256/16384/64_min 196056 ns 196018 ns 10 352.321k 21.4203 79.844Mi/s -shake256/16384/64_max 196519 ns 196458 
ns 10 353.092k 21.4672 80.0234Mi/s -shake128/16384/64_mean 159491 ns 159442 ns 10 286.555k 17.4219 98.3813Mi/s -shake128/16384/64_median 159429 ns 159382 ns 10 286.44k 17.4149 98.4179Mi/s -shake128/16384/64_stddev 252 ns 243 ns 10 411.169 0.0249981 153.209Ki/s -shake128/16384/64_cv 0.16 % 0.15 % 10 0.14% 0.14% 0.15% -shake128/16384/64_min 159179 ns 159148 ns 10 286.062k 17.3919 98.064Mi/s -shake128/16384/64_max 160029 ns 159957 ns 10 287.434k 17.4753 98.5625Mi/s -sha3_512/4096_mean 91818 ns 91794 ns 10 164.996k 39.6624 43.2193Mi/s -sha3_512/4096_median 91824 ns 91803 ns 10 165k 39.6635 43.2153Mi/s -sha3_512/4096_stddev 23.7 ns 19.0 ns 10 26.2386 6.30735m 9.15854Ki/s -sha3_512/4096_cv 0.03 % 0.02 % 10 0.02% 0.02% 0.02% -sha3_512/4096_min 91779 ns 91764 ns 10 164.962k 39.6544 43.2106Mi/s -sha3_512/4096_max 91842 ns 91813 ns 10 165.027k 39.67 43.2335Mi/s -sha3_384/4096_mean 64650 ns 64633 ns 10 116.172k 28.0338 61.1454Mi/s -sha3_384/4096_median 64648 ns 64634 ns 10 116.179k 28.0354 61.1446Mi/s -sha3_384/4096_stddev 18.7 ns 16.2 ns 10 22.5271 5.43608m 15.713Ki/s -sha3_384/4096_cv 0.03 % 0.03 % 10 0.02% 0.02% 0.03% -sha3_384/4096_min 64620 ns 64609 ns 10 116.135k 28.0248 61.1187Mi/s -sha3_384/4096_max 64679 ns 64662 ns 10 116.212k 28.0434 61.1682Mi/s -shake128/1024/64_mean 11665 ns 11661 ns 10 20.9578k 19.2627 88.9798Mi/s -shake128/1024/64_median 11664 ns 11661 ns 10 20.957k 19.2619 88.9816Mi/s -shake128/1024/64_stddev 3.34 ns 2.47 ns 10 2.30432 2.11794m 19.2591Ki/s -shake128/1024/64_cv 0.03 % 0.02 % 10 0.01% 0.01% 0.02% -shake128/1024/64_min 11659 ns 11657 ns 10 20.9553k 19.2604 88.9427Mi/s -shake128/1024/64_max 11672 ns 11666 ns 10 20.961k 19.2656 89.0097Mi/s -sha3_256/16384_mean 196371 ns 196308 ns 10 352.791k 21.4907 79.7498Mi/s -sha3_256/16384_median 196381 ns 196293 ns 10 352.73k 21.4869 79.756Mi/s -sha3_256/16384_stddev 166 ns 181 ns 10 384.334 0.0234122 75.1445Ki/s -sha3_256/16384_cv 0.08 % 0.09 % 10 0.11% 0.11% 0.09% -sha3_256/16384_min 196068 ns 196002 ns 10 
352.231k 21.4566 79.6597Mi/s -sha3_256/16384_max 196561 ns 196530 ns 10 353.287k 21.5209 79.8744Mi/s -sha3_256/4096_mean 50418 ns 50402 ns 10 90.5849k 21.944 78.1071Mi/s -sha3_256/4096_median 50412 ns 50397 ns 10 90.5776k 21.9423 78.1157Mi/s -sha3_256/4096_stddev 25.0 ns 17.2 ns 10 20.5874 4.98725m 27.2261Ki/s -sha3_256/4096_cv 0.05 % 0.03 % 10 0.02% 0.02% 0.03% -sha3_256/4096_min 50386 ns 50377 ns 10 90.5655k 21.9393 78.0548Mi/s -sha3_256/4096_max 50474 ns 50436 ns 10 90.6293k 21.9548 78.1461Mi/s -sha3_224/4096_mean 47250 ns 47238 ns 10 84.9067k 20.5884 83.2579Mi/s -sha3_224/4096_median 47246 ns 47232 ns 10 84.8967k 20.586 83.2685Mi/s -sha3_224/4096_stddev 18.7 ns 15.6 ns 10 19.1114 4.63419m 28.1343Ki/s -sha3_224/4096_cv 0.04 % 0.03 % 10 0.02% 0.02% 0.03% -sha3_224/4096_min 47230 ns 47222 ns 10 84.8926k 20.585 83.2037Mi/s -sha3_224/4096_max 47288 ns 47269 ns 10 84.9454k 20.5978 83.2869Mi/s -keccak-p[1600, 24]_mean 1596 ns 1596 ns 10 2.86863k 14.3432 119.512Mi/s -keccak-p[1600, 24]_median 1596 ns 1596 ns 10 2.8686k 14.343 119.513Mi/s -keccak-p[1600, 24]_stddev 0.372 ns 0.257 ns 10 0.119287 596.436u 19.6693Ki/s +sha3_384/256_min 2808 ns 2808 ns 10 5.04844k 16.6067 103.202Mi/s +sha3_384/256_max 2809 ns 2809 ns 10 5.04919k 16.6092 103.238Mi/s +sha3_384/4096_mean 35057 ns 35057 ns 10 63.0092k 15.2049 112.733Mi/s +sha3_384/4096_median 35056 ns 35056 ns 10 63.009k 15.2049 112.735Mi/s +sha3_384/4096_stddev 7.12 ns 4.96 ns 10 2.59006 625.015u 16.3467Ki/s +sha3_384/4096_cv 0.02 % 0.01 % 10 0.00% 0.00% 0.01% +sha3_384/4096_min 35045 ns 35049 ns 10 63.0054k 15.204 112.707Mi/s +sha3_384/4096_max 35069 ns 35065 ns 10 63.0137k 15.206 112.757Mi/s +sha3_512/256_mean 3665 ns 3665 ns 10 6.58859k 20.5893 83.2633Mi/s +sha3_512/256_median 3665 ns 3665 ns 10 6.58842k 20.5888 83.2666Mi/s +sha3_512/256_stddev 0.699 ns 0.691 ns 10 1.39233 4.35103m 16.0704Ki/s +sha3_512/256_cv 0.02 % 0.02 % 10 0.02% 0.02% 0.02% +sha3_512/256_min 3664 ns 3664 ns 10 6.58654k 20.5829 83.2405Mi/s 
+sha3_512/256_max 3666 ns 3666 ns 10 6.59106k 20.5971 83.285Mi/s +sha3_512/64_mean 1044 ns 1044 ns 10 1.87614k 14.6573 116.958Mi/s +sha3_512/64_median 1044 ns 1044 ns 10 1.87613k 14.6573 116.958Mi/s +sha3_512/64_stddev 0.153 ns 0.087 ns 10 0.0163249 127.538u 9.93599Ki/s +sha3_512/64_cv 0.01 % 0.01 % 10 0.00% 0.00% 0.01% +sha3_512/64_min 1043 ns 1044 ns 10 1.87612k 14.6572 116.941Mi/s +sha3_512/64_max 1044 ns 1044 ns 10 1.87617k 14.6576 116.973Mi/s +sha3_224/64_mean 1116 ns 1116 ns 10 2.00537k 21.7975 78.6454Mi/s +sha3_224/64_median 1116 ns 1116 ns 10 2.0053k 21.7968 78.6396Mi/s +sha3_224/64_stddev 0.522 ns 0.476 ns 10 0.876682 9.52915m 34.3417Ki/s +sha3_224/64_cv 0.05 % 0.04 % 10 0.04% 0.04% 0.04% +sha3_224/64_min 1115 ns 1115 ns 10 2.00457k 21.7888 78.5768Mi/s +sha3_224/64_max 1116 ns 1117 ns 10 2.00743k 21.8199 78.6826Mi/s +shake128/256/64_mean 2065 ns 2065 ns 10 3.71232k 11.601 147.754Mi/s +shake128/256/64_median 2065 ns 2065 ns 10 3.71231k 11.601 147.759Mi/s +shake128/256/64_stddev 0.346 ns 0.259 ns 10 0.05968 186.5u 18.9387Ki/s +shake128/256/64_cv 0.02 % 0.01 % 10 0.00% 0.00% 0.01% +shake128/256/64_min 2065 ns 2065 ns 10 3.71223k 11.6007 147.714Mi/s +shake128/256/64_max 2066 ns 2066 ns 10 3.71243k 11.6013 147.775Mi/s +shake256/1024/64_mean 7317 ns 7317 ns 10 13.1525k 12.0887 141.804Mi/s +shake256/1024/64_median 7316 ns 7317 ns 10 13.1524k 12.0886 141.807Mi/s +shake256/1024/64_stddev 1.39 ns 0.839 ns 10 0.780465 717.339u 16.6491Ki/s +shake256/1024/64_cv 0.02 % 0.01 % 10 0.01% 0.01% 0.01% +shake256/1024/64_min 7315 ns 7316 ns 10 13.1515k 12.0877 141.786Mi/s +shake256/1024/64_max 7319 ns 7318 ns 10 13.1539k 12.09 141.833Mi/s +shake256/64/64_mean 1136 ns 1136 ns 10 2.04103k 15.9455 107.497Mi/s +shake256/64/64_median 1135 ns 1135 ns 10 2.04083k 15.944 107.514Mi/s +shake256/64/64_stddev 0.444 ns 0.405 ns 10 0.470208 3.6735m 39.2242Ki/s +shake256/64/64_cv 0.04 % 0.04 % 10 0.02% 0.02% 0.04% +shake256/64/64_min 1135 ns 1135 ns 10 2.04057k 15.942 107.411Mi/s 
+shake256/64/64_max 1137 ns 1136 ns 10 2.0421k 15.9539 107.53Mi/s +sha3_512/16384_mean 198486 ns 198488 ns 10 356.755k 21.6899 79.0278Mi/s +sha3_512/16384_median 198490 ns 198484 ns 10 356.73k 21.6883 79.0292Mi/s +sha3_512/16384_stddev 87.8 ns 86.8 ns 10 155.444 9.45064m 35.3845Ki/s +sha3_512/16384_cv 0.04 % 0.04 % 10 0.04% 0.04% 0.04% +sha3_512/16384_min 198380 ns 198386 ns 10 356.604k 21.6807 78.9526Mi/s +sha3_512/16384_max 198655 ns 198677 ns 10 357.142k 21.7134 79.0683Mi/s +sha3_224/256_mean 2017 ns 2016 ns 10 3.62396k 12.7604 134.316Mi/s +sha3_224/256_median 2016 ns 2016 ns 10 3.62407k 12.7608 134.32Mi/s +sha3_224/256_stddev 1.32 ns 1.14 ns 10 1.49815 5.27519m 78.095Ki/s +sha3_224/256_cv 0.07 % 0.06 % 10 0.04% 0.04% 0.06% +sha3_224/256_min 2014 ns 2015 ns 10 3.62128k 12.751 134.183Mi/s +sha3_224/256_max 2019 ns 2018 ns 10 3.62651k 12.7694 134.443Mi/s +shake256/256/64_mean 2027 ns 2027 ns 10 3.64366k 11.3864 150.534Mi/s +shake256/256/64_median 2027 ns 2027 ns 10 3.64369k 11.3865 150.531Mi/s +shake256/256/64_stddev 0.573 ns 0.508 ns 10 0.670622 2.09569m 38.647Ki/s +shake256/256/64_cv 0.03 % 0.03 % 10 0.02% 0.02% 0.03% +shake256/256/64_min 2026 ns 2027 ns 10 3.64274k 11.3836 150.479Mi/s +shake256/256/64_max 2028 ns 2028 ns 10 3.64468k 11.3896 150.587Mi/s +shake128/4096/64_mean 22446 ns 22446 ns 10 40.3432k 9.69789 176.746Mi/s +shake128/4096/64_median 22446 ns 22446 ns 10 40.3431k 9.69786 176.745Mi/s +shake128/4096/64_stddev 5.63 ns 3.78 ns 10 1.29789 311.993u 30.4457Ki/s +shake128/4096/64_cv 0.03 % 0.02 % 10 0.00% 0.00% 0.02% +shake128/4096/64_min 22437 ns 22440 ns 10 40.3411k 9.69739 176.695Mi/s +shake128/4096/64_max 22454 ns 22453 ns 10 40.345k 9.69831 176.798Mi/s +sha3_224/4096_mean 25960 ns 25961 ns 10 46.6643k 11.3153 151.496Mi/s +sha3_224/4096_median 25961 ns 25961 ns 10 46.6632k 11.315 151.495Mi/s +sha3_224/4096_stddev 3.64 ns 2.81 ns 10 4.74013 1.1494m 16.7751Ki/s +sha3_224/4096_cv 0.01 % 0.01 % 10 0.01% 0.01% 0.01% +sha3_224/4096_min 25955 ns 25957 ns 10 
46.6591k 11.314 151.464Mi/s +sha3_224/4096_max 25966 ns 25966 ns 10 46.6757k 11.3181 151.516Mi/s +sha3_384/1024_mean 8920 ns 8921 ns 10 16.035k 14.958 114.601Mi/s +sha3_384/1024_median 8920 ns 8920 ns 10 16.0346k 14.9577 114.607Mi/s +sha3_384/1024_stddev 2.74 ns 2.41 ns 10 1.37496 1.28261m 31.6902Ki/s +sha3_384/1024_cv 0.03 % 0.03 % 10 0.01% 0.01% 0.03% +sha3_384/1024_min 8918 ns 8919 ns 10 16.0336k 14.9568 114.516Mi/s +sha3_384/1024_max 8928 ns 8927 ns 10 16.0383k 14.9611 114.624Mi/s +sha3_384/16384_mean 138096 ns 138094 ns 10 248.196k 15.1044 113.479Mi/s +sha3_384/16384_median 138136 ns 138125 ns 10 248.218k 15.1058 113.453Mi/s +sha3_384/16384_stddev 174 ns 177 ns 10 334.625 0.0203642 148.935Ki/s +sha3_384/16384_cv 0.13 % 0.13 % 10 0.13% 0.13% 0.13% +sha3_384/16384_min 137890 ns 137893 ns 10 247.834k 15.0824 113.199Mi/s +sha3_384/16384_max 138420 ns 138435 ns 10 248.857k 15.1446 113.644Mi/s +sha3_384/64_mean 1060 ns 1060 ns 10 1.90527k 17.0114 100.768Mi/s +sha3_384/64_median 1060 ns 1060 ns 10 1.90537k 17.0123 100.775Mi/s +sha3_384/64_stddev 0.320 ns 0.336 ns 10 0.722997 6.45533m 32.7525Ki/s +sha3_384/64_cv 0.03 % 0.03 % 10 0.04% 0.04% 0.03% +sha3_384/64_min 1059 ns 1059 ns 10 1.9041k 17.0009 100.711Mi/s +sha3_384/64_max 1061 ns 1061 ns 10 1.90615k 17.0192 100.82Mi/s +sha3_512/4096_mean 49713 ns 49714 ns 10 89.3619k 21.4812 79.8016Mi/s +sha3_512/4096_median 49709 ns 49713 ns 10 89.362k 21.4813 79.8036Mi/s +sha3_512/4096_stddev 18.5 ns 16.0 ns 10 23.0994 5.55275m 26.2649Ki/s +sha3_512/4096_cv 0.04 % 0.03 % 10 0.03% 0.03% 0.03% +sha3_512/4096_min 49692 ns 49698 ns 10 89.3225k 21.4718 79.7405Mi/s +sha3_512/4096_max 49757 ns 49752 ns 10 89.4112k 21.4931 79.8279Mi/s +shake256/16384/64_mean 107051 ns 107054 ns 10 192.402k 11.6976 146.526Mi/s +shake256/16384/64_median 107085 ns 107083 ns 10 192.414k 11.6983 146.484Mi/s +shake256/16384/64_stddev 204 ns 201 ns 10 354.397 0.0215465 281.878Ki/s +shake256/16384/64_cv 0.19 % 0.19 % 10 0.18% 0.18% 0.19% +shake256/16384/64_min 
106814 ns 106825 ns 10 192.022k 11.6745 146.204Mi/s +shake256/16384/64_max 107278 ns 107289 ns 10 192.818k 11.7229 146.838Mi/s +sha3_256/256_mean 2013 ns 2013 ns 10 3.61754k 12.5609 136.463Mi/s +sha3_256/256_median 2013 ns 2013 ns 10 3.61741k 12.5605 136.461Mi/s +sha3_256/256_stddev 0.419 ns 0.485 ns 10 0.982835 3.41262m 33.6599Ki/s +sha3_256/256_cv 0.02 % 0.02 % 10 0.03% 0.03% 0.02% +sha3_256/256_min 2012 ns 2012 ns 10 3.61602k 12.5556 136.405Mi/s +sha3_256/256_max 2013 ns 2014 ns 10 3.61951k 12.5677 136.521Mi/s +shake128/64/64_mean 1173 ns 1173 ns 10 2.10873k 16.4745 104.053Mi/s +shake128/64/64_median 1173 ns 1173 ns 10 2.1087k 16.4742 104.052Mi/s +shake128/64/64_stddev 0.223 ns 0.145 ns 10 0.0999248 780.663u 13.1417Ki/s +shake128/64/64_cv 0.02 % 0.01 % 10 0.00% 0.00% 0.01% +shake128/64/64_min 1173 ns 1173 ns 10 2.10862k 16.4736 104.033Mi/s +shake128/64/64_max 1173 ns 1173 ns 10 2.10895k 16.4762 104.072Mi/s +sha3_256/16384_mean 108442 ns 108447 ns 10 194.921k 11.8738 144.362Mi/s +sha3_256/16384_median 108380 ns 108382 ns 10 194.809k 11.867 144.448Mi/s +sha3_256/16384_stddev 311 ns 313 ns 10 564.025 0.0343582 425.739Ki/s +sha3_256/16384_cv 0.29 % 0.29 % 10 0.29% 0.29% 0.29% +sha3_256/16384_min 108049 ns 108061 ns 10 194.233k 11.8319 143.676Mi/s +sha3_256/16384_max 108964 ns 108964 ns 10 195.85k 11.9304 144.877Mi/s +shake128/16384/64_mean 87417 ns 87420 ns 10 157.108k 9.55182 179.434Mi/s +shake128/16384/64_median 87360 ns 87366 ns 10 157.032k 9.54721 179.544Mi/s +shake128/16384/64_stddev 148 ns 149 ns 10 250.579 0.0152346 312.577Ki/s +shake128/16384/64_cv 0.17 % 0.17 % 10 0.16% 0.16% 0.17% +shake128/16384/64_min 87169 ns 87162 ns 10 156.637k 9.52314 178.964Mi/s +shake128/16384/64_max 87651 ns 87649 ns 10 157.471k 9.57387 179.964Mi/s +sha3_256/64_mean 1112 ns 1112 ns 10 1.99811k 20.8136 82.3571Mi/s +sha3_256/64_median 1112 ns 1112 ns 10 1.99807k 20.8133 82.3575Mi/s +sha3_256/64_stddev 0.946 ns 0.937 ns 10 1.70455 0.0177557 71.0998Ki/s +sha3_256/64_cv 0.09 % 0.08 % 
10 0.09% 0.09% 0.08% +sha3_256/64_min 1110 ns 1110 ns 10 1.99573k 20.7889 82.2608Mi/s +sha3_256/64_max 1113 ns 1113 ns 10 2.00053k 20.8389 82.4637Mi/s +sha3_512/1024_mean 13218 ns 13218 ns 10 23.7582k 21.8365 78.499Mi/s +sha3_512/1024_median 13219 ns 13219 ns 10 23.7595k 21.8378 78.4923Mi/s +sha3_512/1024_stddev 5.95 ns 5.35 ns 10 9.384 8.625m 32.5618Ki/s +sha3_512/1024_cv 0.05 % 0.04 % 10 0.04% 0.04% 0.04% +sha3_512/1024_min 13206 ns 13208 ns 10 23.7442k 21.8237 78.4465Mi/s +sha3_512/1024_max 13227 ns 13227 ns 10 23.7738k 21.8509 78.5595Mi/s +shake128/1024/64_mean 6500 ns 6500 ns 10 11.6832k 10.7382 159.624Mi/s +shake128/1024/64_median 6500 ns 6500 ns 10 11.6832k 10.7382 159.629Mi/s +shake128/1024/64_stddev 1.67 ns 1.11 ns 10 0.150961 138.751u 27.8682Ki/s +shake128/1024/64_cv 0.03 % 0.02 % 10 0.00% 0.00% 0.02% +shake128/1024/64_min 6498 ns 6499 ns 10 11.6829k 10.7379 159.562Mi/s +shake128/1024/64_max 6504 ns 6503 ns 10 11.6834k 10.7384 159.655Mi/s +shake256/4096/64_mean 27567 ns 27568 ns 10 49.5523k 11.9116 143.908Mi/s +shake256/4096/64_median 27565 ns 27567 ns 10 49.5523k 11.9116 143.913Mi/s +shake256/4096/64_stddev 5.84 ns 3.87 ns 10 1.24483 299.238u 20.6705Ki/s +shake256/4096/64_cv 0.02 % 0.01 % 10 0.00% 0.00% 0.01% +shake256/4096/64_min 27561 ns 27563 ns 10 49.5502k 11.9111 143.871Mi/s +shake256/4096/64_max 27580 ns 27575 ns 10 49.5539k 11.912 143.933Mi/s +sha3_256/4096_mean 27886 ns 27885 ns 10 50.1184k 12.1411 141.176Mi/s +sha3_256/4096_median 27890 ns 27888 ns 10 50.1157k 12.1404 141.163Mi/s +sha3_256/4096_stddev 21.3 ns 20.7 ns 10 36.7096 8.89282m 107.092Ki/s +sha3_256/4096_cv 0.08 % 0.07 % 10 0.07% 0.07% 0.07% +sha3_256/4096_min 27854 ns 27857 ns 10 50.0751k 12.1306 141.039Mi/s +sha3_256/4096_max 27915 ns 27913 ns 10 50.1666k 12.1528 141.319Mi/s +sha3_224/1024_mean 7334 ns 7334 ns 10 13.1817k 12.5301 136.794Mi/s +sha3_224/1024_median 7335 ns 7334 ns 10 13.1818k 12.5302 136.796Mi/s +sha3_224/1024_stddev 1.39 ns 0.948 ns 10 0.795119 755.817u 18.0968Ki/s 
+sha3_224/1024_cv 0.02 % 0.01 % 10 0.01% 0.01% 0.01% +sha3_224/1024_min 7332 ns 7333 ns 10 13.1806k 12.5291 136.77Mi/s +sha3_224/1024_max 7336 ns 7335 ns 10 13.1832k 12.5316 136.816Mi/s +keccak-p[1600, 24]_mean 862 ns 862 ns 10 1.54907k 7.74537 221.318Mi/s +keccak-p[1600, 24]_median 862 ns 862 ns 10 1.54908k 7.74541 221.322Mi/s +keccak-p[1600, 24]_stddev 0.200 ns 0.136 ns 10 0.016987 84.935u 35.8821Ki/s keccak-p[1600, 24]_cv 0.02 % 0.02 % 10 0.00% 0.00% 0.02% -keccak-p[1600, 24]_min 1596 ns 1596 ns 10 2.86855k 14.3427 119.478Mi/s -keccak-p[1600, 24]_max 1597 ns 1596 ns 10 2.86896k 14.3448 119.541Mi/s -shake128/4096/64_mean 40857 ns 40845 ns 10 73.408k 17.6462 97.1305Mi/s -shake128/4096/64_median 40837 ns 40827 ns 10 73.3832k 17.6402 97.1728Mi/s -shake128/4096/64_stddev 37.4 ns 35.4 ns 10 45.2564 0.0108789 86.1828Ki/s -shake128/4096/64_cv 0.09 % 0.09 % 10 0.06% 0.06% 0.09% -shake128/4096/64_min 40827 ns 40818 ns 10 73.3707k 17.6372 96.9358Mi/s -shake128/4096/64_max 40941 ns 40927 ns 10 73.5104k 17.6708 97.1944Mi/s -sha3_384/1024_mean 16247 ns 16241 ns 10 29.1909k 27.2303 62.9467Mi/s -sha3_384/1024_median 16247 ns 16241 ns 10 29.1903k 27.2298 62.948Mi/s -sha3_384/1024_stddev 4.54 ns 2.93 ns 10 1.83977 1.7162m 11.6162Ki/s -sha3_384/1024_cv 0.03 % 0.02 % 10 0.01% 0.01% 0.02% -sha3_384/1024_min 16241 ns 16237 ns 10 29.1888k 27.2283 62.9262Mi/s -sha3_384/1024_max 16254 ns 16247 ns 10 29.1941k 27.2333 62.9616Mi/s -sha3_224/16384_mean 185317 ns 185264 ns 10 332.97k 20.2882 84.4834Mi/s -sha3_224/16384_median 185397 ns 185354 ns 10 333.143k 20.2987 84.4422Mi/s -sha3_224/16384_stddev 219 ns 224 ns 10 419.686 0.0255719 104.672Ki/s -sha3_224/16384_cv 0.12 % 0.12 % 10 0.13% 0.13% 0.12% -sha3_224/16384_min 184990 ns 184929 ns 10 332.357k 20.2509 84.3771Mi/s -sha3_224/16384_max 185566 ns 185497 ns 10 333.366k 20.3123 84.6365Mi/s -sha3_512/1024_mean 24255 ns 24247 ns 10 43.5775k 40.0528 42.7932Mi/s -sha3_512/1024_median 24255 ns 24248 ns 10 43.5768k 40.0522 42.7908Mi/s 
-sha3_512/1024_stddev 6.55 ns 5.50 ns 10 6.53887 6.01m 9.94629Ki/s -sha3_512/1024_cv 0.03 % 0.02 % 10 0.02% 0.02% 0.02% -sha3_512/1024_min 24246 ns 24239 ns 10 43.5684k 40.0445 42.7794Mi/s -sha3_512/1024_max 24263 ns 24255 ns 10 43.588k 40.0625 42.8066Mi/s -shake256/4096/64_mean 50411 ns 50398 ns 10 90.5836k 21.7749 78.7198Mi/s -shake256/4096/64_median 50411 ns 50395 ns 10 90.5739k 21.7726 78.7242Mi/s -shake256/4096/64_stddev 17.2 ns 15.7 ns 10 23.0543 5.54191m 25.1537Ki/s -shake256/4096/64_cv 0.03 % 0.03 % 10 0.03% 0.03% 0.03% -shake256/4096/64_min 50385 ns 50376 ns 10 90.5585k 21.7689 78.6656Mi/s -shake256/4096/64_max 50447 ns 50432 ns 10 90.6309k 21.7863 78.7533Mi/s +keccak-p[1600, 24]_min 861 ns 862 ns 10 1.54904k 7.74522 221.258Mi/s +keccak-p[1600, 24]_max 862 ns 862 ns 10 1.54909k 7.74546 221.375Mi/s +sha3_256/1024_mean 7366 ns 7366 ns 10 13.2407k 12.5385 136.712Mi/s +sha3_256/1024_median 7365 ns 7366 ns 10 13.2411k 12.5389 136.721Mi/s +sha3_256/1024_stddev 3.53 ns 3.43 ns 10 6.45902 6.1165m 65.257Ki/s +sha3_256/1024_cv 0.05 % 0.05 % 10 0.05% 0.05% 0.05% +sha3_256/1024_min 7361 ns 7362 ns 10 13.2305k 12.5289 136.62Mi/s +sha3_256/1024_max 7371 ns 7371 ns 10 13.2523k 12.5496 136.792Mi/s +sha3_224/16384_mean 101629 ns 101627 ns 10 182.654k 11.1293 154.011Mi/s +sha3_224/16384_median 101604 ns 101585 ns 10 182.538k 11.1222 154.075Mi/s +sha3_224/16384_stddev 177 ns 180 ns 10 324.501 0.0197722 279.456Ki/s +sha3_224/16384_cv 0.17 % 0.18 % 10 0.18% 0.18% 0.18% +sha3_224/16384_min 101395 ns 101407 ns 10 182.286k 11.1069 153.594Mi/s +sha3_224/16384_max 101893 ns 101903 ns 10 183.169k 11.1607 154.346Mi/s ``` ## Usage @@ -668,3 +845,6 @@ SHAKE-256 Input : a6506638e34127e0a8415241479c968c20422f46497663eaf244f205a756f0b3 Output : ce679163b642380365c3c11dcbca7a36ddd01cefba35b8ec18ad937268f584999c6e8ae061c251dd ``` + +> [!NOTE] +> This library doesn't expose any raw pointer + length-based interfaces; rather, everything is wrapped under the much safer `std::span` - which one can
easily create from `std::{array, vector}` or even raw pointers and length pair. See https://en.cppreference.com/w/cpp/container/span. I made this choice because this gives us much better type safety and compile-time error reporting. diff --git a/gtest-parallel b/gtest-parallel new file mode 160000 index 0000000..96f4f90 --- /dev/null +++ b/gtest-parallel @@ -0,0 +1 @@ +Subproject commit 96f4f904922f9bf66689e749c40f314845baaac8 diff --git a/include/keccak.hpp b/include/keccak.hpp index 0ebe439..3241ab8 100644 --- a/include/keccak.hpp +++ b/include/keccak.hpp @@ -125,25 +125,486 @@ compute_rcs() // https://dx.doi.org/10.s6028/NIST.FIPS.202 static constexpr auto RC = compute_rcs(); -// Keccak-p[1600, 24] step mapping function θ, see section 3.2.1 of SHA3 -// specification https://dx.doi.org/10.6028/NIST.FIPS.202 -inline static constexpr void -theta(uint64_t* const state) +#if defined __APPLE__ && defined __aarch64__ // On Apple Silicon + +// Keccak-p[1600, 24] round function, applying all five step mapping functions, +// updating state array. Note this implementation of round function applies four +// consecutive rounds in a single call i.e. if you invoke it to apply round `i` +// +// - it first applies round `i` +// - then round `i+1` +// - and then round `i+2` +// - and finally round `i+3` +// +// See section 3.3 of https://dx.doi.org/10.6028/NIST.FIPS.202 +// +// This Keccak round function implementation is specifically targeting Apple +// Silicon CPUs. And this implementation collects a lot of inspiration from +// https://github.com/bwesterb/armed-keccak.git. 
+static inline constexpr void +roundx4(uint64_t* const state, const size_t ridx) { - uint64_t c[5]{}; - uint64_t d[5]; + std::array bc{}, d{}; + uint64_t t = 0; +// Round ridx + 0 #if defined __clang__ // Following // https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations #pragma clang loop unroll(enable) #pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) #elif defined __GNUG__ // Following // https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html#Loop-Specific-Pragmas #pragma GCC unroll 5 +#pragma GCC ivdep +#endif + for (size_t i = 0; i < 25; i += 5) { + bc[0] ^= state[i + 0]; + bc[1] ^= state[i + 1]; + bc[2] ^= state[i + 2]; + bc[3] ^= state[i + 3]; + bc[4] ^= state[i + 4]; + } + + d[0] = bc[4] ^ std::rotl(bc[1], 1); + d[1] = bc[0] ^ std::rotl(bc[2], 1); + d[2] = bc[1] ^ std::rotl(bc[3], 1); + d[3] = bc[2] ^ std::rotl(bc[4], 1); + d[4] = bc[3] ^ std::rotl(bc[0], 1); + + bc[0] = state[0] ^ d[0]; + t = state[6] ^ d[1]; + bc[1] = std::rotl(t, ROT[6]); + t = state[12] ^ d[2]; + bc[2] = std::rotl(t, ROT[12]); + t = state[18] ^ d[3]; + bc[3] = std::rotl(t, ROT[18]); + t = state[24] ^ d[4]; + bc[4] = std::rotl(t, ROT[24]); + + state[0] = bc[0] ^ (bc[2] & ~bc[1]) ^ RC[ridx]; + state[6] = bc[1] ^ (bc[3] & ~bc[2]); + state[12] = bc[2] ^ (bc[4] & ~bc[3]); + state[18] = bc[3] ^ (bc[0] & ~bc[4]); + state[24] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[10] ^ d[0]; + bc[2] = std::rotl(t, ROT[10]); + t = state[16] ^ d[1]; + bc[3] = std::rotl(t, ROT[16]); + t = state[22] ^ d[2]; + bc[4] = std::rotl(t, ROT[22]); + t = state[3] ^ d[3]; + bc[0] = std::rotl(t, ROT[3]); + t = state[9] ^ d[4]; + bc[1] = std::rotl(t, ROT[9]); + + state[10] = bc[0] ^ (bc[2] & ~bc[1]); + state[16] = bc[1] ^ (bc[3] & ~bc[2]); + state[22] = bc[2] ^ (bc[4] & ~bc[3]); + state[3] = bc[3] ^ (bc[0] & ~bc[4]); + state[9] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[20] ^ d[0]; + bc[4] = std::rotl(t, ROT[20]); + t = state[1] ^ d[1]; + bc[0] = 
std::rotl(t, ROT[1]); + t = state[7] ^ d[2]; + bc[1] = std::rotl(t, ROT[7]); + t = state[13] ^ d[3]; + bc[2] = std::rotl(t, ROT[13]); + t = state[19] ^ d[4]; + bc[3] = std::rotl(t, ROT[19]); + + state[20] = bc[0] ^ (bc[2] & ~bc[1]); + state[1] = bc[1] ^ (bc[3] & ~bc[2]); + state[7] = bc[2] ^ (bc[4] & ~bc[3]); + state[13] = bc[3] ^ (bc[0] & ~bc[4]); + state[19] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[5] ^ d[0]; + bc[1] = std::rotl(t, ROT[5]); + t = state[11] ^ d[1]; + bc[2] = std::rotl(t, ROT[11]); + t = state[17] ^ d[2]; + bc[3] = std::rotl(t, ROT[17]); + t = state[23] ^ d[3]; + bc[4] = std::rotl(t, ROT[23]); + t = state[4] ^ d[4]; + bc[0] = std::rotl(t, ROT[4]); + + state[5] = bc[0] ^ (bc[2] & ~bc[1]); + state[11] = bc[1] ^ (bc[3] & ~bc[2]); + state[17] = bc[2] ^ (bc[4] & ~bc[3]); + state[23] = bc[3] ^ (bc[0] & ~bc[4]); + state[4] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[15] ^ d[0]; + bc[3] = std::rotl(t, ROT[15]); + t = state[21] ^ d[1]; + bc[4] = std::rotl(t, ROT[21]); + t = state[2] ^ d[2]; + bc[0] = std::rotl(t, ROT[2]); + t = state[8] ^ d[3]; + bc[1] = std::rotl(t, ROT[8]); + t = state[14] ^ d[4]; + bc[2] = std::rotl(t, ROT[14]); + + state[15] = bc[0] ^ (bc[2] & ~bc[1]); + state[21] = bc[1] ^ (bc[3] & ~bc[2]); + state[2] = bc[2] ^ (bc[4] & ~bc[3]); + state[8] = bc[3] ^ (bc[0] & ~bc[4]); + state[14] = bc[4] ^ (bc[1] & ~bc[0]); + + // Round ridx + 1 + std::fill(bc.begin(), bc.end(), 0x00); + +#if defined __clang__ +#pragma clang loop unroll(enable) +#pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) +#elif defined __GNUG__ +#pragma GCC unroll 5 +#pragma GCC ivdep +#endif + for (size_t i = 0; i < 25; i += 5) { + bc[0] ^= state[i + 0]; + bc[1] ^= state[i + 1]; + bc[2] ^= state[i + 2]; + bc[3] ^= state[i + 3]; + bc[4] ^= state[i + 4]; + } + + d[0] = bc[4] ^ std::rotl(bc[1], 1); + d[1] = bc[0] ^ std::rotl(bc[2], 1); + d[2] = bc[1] ^ std::rotl(bc[3], 1); + d[3] = bc[2] ^ std::rotl(bc[4], 1); + d[4] = bc[3] ^ std::rotl(bc[0], 1); + + bc[0] 
= state[0] ^ d[0]; + t = state[16] ^ d[1]; + bc[1] = std::rotl(t, ROT[6]); + t = state[7] ^ d[2]; + bc[2] = std::rotl(t, ROT[12]); + t = state[23] ^ d[3]; + bc[3] = std::rotl(t, ROT[18]); + t = state[14] ^ d[4]; + bc[4] = std::rotl(t, ROT[24]); + + state[0] = bc[0] ^ (bc[2] & ~bc[1]) ^ RC[ridx + 1]; + state[16] = bc[1] ^ (bc[3] & ~bc[2]); + state[7] = bc[2] ^ (bc[4] & ~bc[3]); + state[23] = bc[3] ^ (bc[0] & ~bc[4]); + state[14] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[20] ^ d[0]; + bc[2] = std::rotl(t, ROT[10]); + t = state[11] ^ d[1]; + bc[3] = std::rotl(t, ROT[16]); + t = state[2] ^ d[2]; + bc[4] = std::rotl(t, ROT[22]); + t = state[18] ^ d[3]; + bc[0] = std::rotl(t, ROT[3]); + t = state[9] ^ d[4]; + bc[1] = std::rotl(t, ROT[9]); + + state[20] = bc[0] ^ (bc[2] & ~bc[1]); + state[11] = bc[1] ^ (bc[3] & ~bc[2]); + state[2] = bc[2] ^ (bc[4] & ~bc[3]); + state[18] = bc[3] ^ (bc[0] & ~bc[4]); + state[9] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[15] ^ d[0]; + bc[4] = std::rotl(t, ROT[20]); + t = state[6] ^ d[1]; + bc[0] = std::rotl(t, ROT[1]); + t = state[22] ^ d[2]; + bc[1] = std::rotl(t, ROT[7]); + t = state[13] ^ d[3]; + bc[2] = std::rotl(t, ROT[13]); + t = state[4] ^ d[4]; + bc[3] = std::rotl(t, ROT[19]); + + state[15] = bc[0] ^ (bc[2] & ~bc[1]); + state[6] = bc[1] ^ (bc[3] & ~bc[2]); + state[22] = bc[2] ^ (bc[4] & ~bc[3]); + state[13] = bc[3] ^ (bc[0] & ~bc[4]); + state[4] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[10] ^ d[0]; + bc[1] = std::rotl(t, ROT[5]); + t = state[1] ^ d[1]; + bc[2] = std::rotl(t, ROT[11]); + t = state[17] ^ d[2]; + bc[3] = std::rotl(t, ROT[17]); + t = state[8] ^ d[3]; + bc[4] = std::rotl(t, ROT[23]); + t = state[24] ^ d[4]; + bc[0] = std::rotl(t, ROT[4]); + + state[10] = bc[0] ^ (bc[2] & ~bc[1]); + state[1] = bc[1] ^ (bc[3] & ~bc[2]); + state[17] = bc[2] ^ (bc[4] & ~bc[3]); + state[8] = bc[3] ^ (bc[0] & ~bc[4]); + state[24] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[5] ^ d[0]; + bc[3] = std::rotl(t, ROT[15]); + t = state[21] ^ d[1]; + bc[4] 
= std::rotl(t, ROT[21]); + t = state[12] ^ d[2]; + bc[0] = std::rotl(t, ROT[2]); + t = state[3] ^ d[3]; + bc[1] = std::rotl(t, ROT[8]); + t = state[19] ^ d[4]; + bc[2] = std::rotl(t, ROT[14]); + + state[5] = bc[0] ^ (bc[2] & ~bc[1]); + state[21] = bc[1] ^ (bc[3] & ~bc[2]); + state[12] = bc[2] ^ (bc[4] & ~bc[3]); + state[3] = bc[3] ^ (bc[0] & ~bc[4]); + state[19] = bc[4] ^ (bc[1] & ~bc[0]); + + // Round ridx + 2 + std::fill(bc.begin(), bc.end(), 0x00); + +#if defined __clang__ +#pragma clang loop unroll(enable) +#pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) +#elif defined __GNUG__ +#pragma GCC unroll 5 +#pragma GCC ivdep +#endif + for (size_t i = 0; i < 25; i += 5) { + bc[0] ^= state[i + 0]; + bc[1] ^= state[i + 1]; + bc[2] ^= state[i + 2]; + bc[3] ^= state[i + 3]; + bc[4] ^= state[i + 4]; + } + + d[0] = bc[4] ^ std::rotl(bc[1], 1); + d[1] = bc[0] ^ std::rotl(bc[2], 1); + d[2] = bc[1] ^ std::rotl(bc[3], 1); + d[3] = bc[2] ^ std::rotl(bc[4], 1); + d[4] = bc[3] ^ std::rotl(bc[0], 1); + + bc[0] = state[0] ^ d[0]; + t = state[11] ^ d[1]; + bc[1] = std::rotl(t, ROT[6]); + t = state[22] ^ d[2]; + bc[2] = std::rotl(t, ROT[12]); + t = state[8] ^ d[3]; + bc[3] = std::rotl(t, ROT[18]); + t = state[19] ^ d[4]; + bc[4] = std::rotl(t, ROT[24]); + + state[0] = bc[0] ^ (bc[2] & ~bc[1]) ^ RC[ridx + 2]; + state[11] = bc[1] ^ (bc[3] & ~bc[2]); + state[22] = bc[2] ^ (bc[4] & ~bc[3]); + state[8] = bc[3] ^ (bc[0] & ~bc[4]); + state[19] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[15] ^ d[0]; + bc[2] = std::rotl(t, ROT[10]); + t = state[1] ^ d[1]; + bc[3] = std::rotl(t, ROT[16]); + t = state[12] ^ d[2]; + bc[4] = std::rotl(t, ROT[22]); + t = state[23] ^ d[3]; + bc[0] = std::rotl(t, ROT[3]); + t = state[9] ^ d[4]; + bc[1] = std::rotl(t, ROT[9]); + + state[15] = bc[0] ^ (bc[2] & ~bc[1]); + state[1] = bc[1] ^ (bc[3] & ~bc[2]); + state[12] = bc[2] ^ (bc[4] & ~bc[3]); + state[23] = bc[3] ^ (bc[0] & ~bc[4]); + state[9] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[5] ^ 
d[0]; + bc[4] = std::rotl(t, ROT[20]); + t = state[16] ^ d[1]; + bc[0] = std::rotl(t, ROT[1]); + t = state[2] ^ d[2]; + bc[1] = std::rotl(t, ROT[7]); + t = state[13] ^ d[3]; + bc[2] = std::rotl(t, ROT[13]); + t = state[24] ^ d[4]; + bc[3] = std::rotl(t, ROT[19]); + + state[5] = bc[0] ^ (bc[2] & ~bc[1]); + state[16] = bc[1] ^ (bc[3] & ~bc[2]); + state[2] = bc[2] ^ (bc[4] & ~bc[3]); + state[13] = bc[3] ^ (bc[0] & ~bc[4]); + state[24] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[20] ^ d[0]; + bc[1] = std::rotl(t, ROT[5]); + t = state[6] ^ d[1]; + bc[2] = std::rotl(t, ROT[11]); + t = state[17] ^ d[2]; + bc[3] = std::rotl(t, ROT[17]); + t = state[3] ^ d[3]; + bc[4] = std::rotl(t, ROT[23]); + t = state[14] ^ d[4]; + bc[0] = std::rotl(t, ROT[4]); + + state[20] = bc[0] ^ (bc[2] & ~bc[1]); + state[6] = bc[1] ^ (bc[3] & ~bc[2]); + state[17] = bc[2] ^ (bc[4] & ~bc[3]); + state[3] = bc[3] ^ (bc[0] & ~bc[4]); + state[14] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[10] ^ d[0]; + bc[3] = std::rotl(t, ROT[15]); + t = state[21] ^ d[1]; + bc[4] = std::rotl(t, ROT[21]); + t = state[7] ^ d[2]; + bc[0] = std::rotl(t, ROT[2]); + t = state[18] ^ d[3]; + bc[1] = std::rotl(t, ROT[8]); + t = state[4] ^ d[4]; + bc[2] = std::rotl(t, ROT[14]); + + state[10] = bc[0] ^ (bc[2] & ~bc[1]); + state[21] = bc[1] ^ (bc[3] & ~bc[2]); + state[7] = bc[2] ^ (bc[4] & ~bc[3]); + state[18] = bc[3] ^ (bc[0] & ~bc[4]); + state[4] = bc[4] ^ (bc[1] & ~bc[0]); + + // Round ridx + 3 + std::fill(bc.begin(), bc.end(), 0x00); + +#if defined __clang__ +#pragma clang loop unroll(enable) +#pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) +#elif defined __GNUG__ +#pragma GCC unroll 5 +#pragma GCC ivdep +#endif + for (size_t i = 0; i < 25; i += 5) { + bc[0] ^= state[i + 0]; + bc[1] ^= state[i + 1]; + bc[2] ^= state[i + 2]; + bc[3] ^= state[i + 3]; + bc[4] ^= state[i + 4]; + } + + d[0] = bc[4] ^ std::rotl(bc[1], 1); + d[1] = bc[0] ^ std::rotl(bc[2], 1); + d[2] = bc[1] ^ std::rotl(bc[3], 1); + d[3] = 
bc[2] ^ std::rotl(bc[4], 1); + d[4] = bc[3] ^ std::rotl(bc[0], 1); + + bc[0] = state[0] ^ d[0]; + t = state[1] ^ d[1]; + bc[1] = std::rotl(t, ROT[6]); + t = state[2] ^ d[2]; + bc[2] = std::rotl(t, ROT[12]); + t = state[3] ^ d[3]; + bc[3] = std::rotl(t, ROT[18]); + t = state[4] ^ d[4]; + bc[4] = std::rotl(t, ROT[24]); + + state[0] = bc[0] ^ (bc[2] & ~bc[1]) ^ RC[ridx + 3]; + state[1] = bc[1] ^ (bc[3] & ~bc[2]); + state[2] = bc[2] ^ (bc[4] & ~bc[3]); + state[3] = bc[3] ^ (bc[0] & ~bc[4]); + state[4] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[5] ^ d[0]; + bc[2] = std::rotl(t, ROT[10]); + t = state[6] ^ d[1]; + bc[3] = std::rotl(t, ROT[16]); + t = state[7] ^ d[2]; + bc[4] = std::rotl(t, ROT[22]); + t = state[8] ^ d[3]; + bc[0] = std::rotl(t, ROT[3]); + t = state[9] ^ d[4]; + bc[1] = std::rotl(t, ROT[9]); + + state[5] = bc[0] ^ (bc[2] & ~bc[1]); + state[6] = bc[1] ^ (bc[3] & ~bc[2]); + state[7] = bc[2] ^ (bc[4] & ~bc[3]); + state[8] = bc[3] ^ (bc[0] & ~bc[4]); + state[9] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[10] ^ d[0]; + bc[4] = std::rotl(t, ROT[20]); + t = state[11] ^ d[1]; + bc[0] = std::rotl(t, ROT[1]); + t = state[12] ^ d[2]; + bc[1] = std::rotl(t, ROT[7]); + t = state[13] ^ d[3]; + bc[2] = std::rotl(t, ROT[13]); + t = state[14] ^ d[4]; + bc[3] = std::rotl(t, ROT[19]); + + state[10] = bc[0] ^ (bc[2] & ~bc[1]); + state[11] = bc[1] ^ (bc[3] & ~bc[2]); + state[12] = bc[2] ^ (bc[4] & ~bc[3]); + state[13] = bc[3] ^ (bc[0] & ~bc[4]); + state[14] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[15] ^ d[0]; + bc[1] = std::rotl(t, ROT[5]); + t = state[16] ^ d[1]; + bc[2] = std::rotl(t, ROT[11]); + t = state[17] ^ d[2]; + bc[3] = std::rotl(t, ROT[17]); + t = state[18] ^ d[3]; + bc[4] = std::rotl(t, ROT[23]); + t = state[19] ^ d[4]; + bc[0] = std::rotl(t, ROT[4]); + + state[15] = bc[0] ^ (bc[2] & ~bc[1]); + state[16] = bc[1] ^ (bc[3] & ~bc[2]); + state[17] = bc[2] ^ (bc[4] & ~bc[3]); + state[18] = bc[3] ^ (bc[0] & ~bc[4]); + state[19] = bc[4] ^ (bc[1] & ~bc[0]); + + t = state[20] 
^ d[0]; + bc[3] = std::rotl(t, ROT[15]); + t = state[21] ^ d[1]; + bc[4] = std::rotl(t, ROT[21]); + t = state[22] ^ d[2]; + bc[0] = std::rotl(t, ROT[2]); + t = state[23] ^ d[3]; + bc[1] = std::rotl(t, ROT[8]); + t = state[24] ^ d[4]; + bc[2] = std::rotl(t, ROT[14]); + + state[20] = bc[0] ^ (bc[2] & ~bc[1]); + state[21] = bc[1] ^ (bc[3] & ~bc[2]); + state[22] = bc[2] ^ (bc[4] & ~bc[3]); + state[23] = bc[3] ^ (bc[0] & ~bc[4]); + state[24] = bc[4] ^ (bc[1] & ~bc[0]); +} + +#else // On everywhere else + +// Keccak-p[1600, 24] step mapping function θ, see section 3.2.1 of SHA3 +// specification https://dx.doi.org/10.6028/NIST.FIPS.202 +static inline constexpr void +theta(uint64_t* const state) +{ + uint64_t c[5]{}; + uint64_t d[5]; + +#if defined __clang__ +#pragma clang loop unroll(enable) +#pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) +#elif defined __GNUG__ +#pragma GCC unroll 5 +#pragma GCC ivdep #endif for (size_t i = 0; i < 25; i += 5) { c[0] ^= state[i + 0]; @@ -160,16 +621,12 @@ theta(uint64_t* const state) d[4] = c[3] ^ std::rotl(c[0], 1); #if defined __clang__ - // Following - // https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations - #pragma clang loop unroll(enable) #pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) #elif defined __GNUG__ - // Following - // https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html#Loop-Specific-Pragmas - #pragma GCC unroll 5 +#pragma GCC ivdep #endif for (size_t i = 0; i < 25; i += 5) { state[i + 0] ^= d[0]; @@ -182,20 +639,16 @@ theta(uint64_t* const state) // Keccak-p[1600, 24] step mapping function ρ, see section 3.2.2 of SHA3 // specification https://dx.doi.org/10.6028/NIST.FIPS.202 -inline static constexpr void +static inline constexpr void rho(uint64_t* const state) { #if defined __clang__ - // Following - // https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations - #pragma clang loop 
unroll(enable) #pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) #elif defined __GNUG__ - // Following - // https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html#Loop-Specific-Pragmas - #pragma GCC unroll 25 +#pragma GCC ivdep #endif for (size_t i = 0; i < 25; i++) { state[i] = std::rotl(state[i], ROT[i]); @@ -204,22 +657,18 @@ rho(uint64_t* const state) // Keccak-p[1600, 24] step mapping function π, see section 3.2.3 of SHA3 // specification https://dx.doi.org/10.6028/NIST.FIPS.202 -inline static constexpr void +static inline constexpr void pi(const uint64_t* const __restrict istate, // input permutation state uint64_t* const __restrict ostate // output permutation state ) { #if defined __clang__ - // Following - // https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations - #pragma clang loop unroll(enable) #pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) #elif defined __GNUG__ - // Following - // https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html#Loop-Specific-Pragmas - #pragma GCC unroll 25 +#pragma GCC ivdep #endif for (size_t i = 0; i < 25; i++) { ostate[i] = istate[PERM[i]]; @@ -228,20 +677,16 @@ pi(const uint64_t* const __restrict istate, // input permutation state // Keccak-p[1600, 24] step mapping function χ, see section 3.2.4 of SHA3 // specification https://dx.doi.org/10.6028/NIST.FIPS.202 -inline static constexpr void +static inline constexpr void chi(uint64_t* const state) { #if defined __clang__ - // Following - // https://clang.llvm.org/docs/LanguageExtensions.html#extensions-for-loop-hint-optimizations - #pragma clang loop unroll(enable) #pragma clang loop vectorize(enable) +#pragma clang loop interleave(enable) #elif defined __GNUG__ - // Following - // https://gcc.gnu.org/onlinedocs/gcc/Loop-Specific-Pragmas.html#Loop-Specific-Pragmas - #pragma GCC unroll 5 +#pragma GCC ivdep #endif for (size_t i = 0; i < 5; i++) { const size_t ix5 = i * 5; @@ 
-259,7 +704,7 @@ chi(uint64_t* const state) // Keccak-p[1600, 24] step mapping function ι, see section 3.2.5 of SHA3 // specification https://dx.doi.org/10.6028/NIST.FIPS.202 -inline static constexpr void +static inline constexpr void iota(uint64_t* const state, const size_t ridx) { state[0] ^= RC[ridx]; @@ -271,7 +716,7 @@ iota(uint64_t* const state, const size_t ridx) // it to apply round `i` - it first applies round `i` and then round `i+1`. // // See section 3.3 of https://dx.doi.org/10.6028/NIST.FIPS.202 -inline static constexpr void +static inline constexpr void roundx2(uint64_t* const state, const size_t ridx) { uint64_t tmp[LANE_CNT]{}; @@ -291,6 +736,8 @@ roundx2(uint64_t* const state, const size_t ridx) iota(state, ridx + 1); } +#endif + // Keccak-p[1600, 24] permutation, applying 24 rounds of permutation // on state of dimension 5 x 5 x 64 ( = 1600 ) -bits, using algorithm 7 // defined in section 3.3 of SHA3 specification @@ -298,9 +745,15 @@ roundx2(uint64_t* const state, const size_t ridx) inline constexpr void permute(uint64_t state[LANE_CNT]) { +#if defined __APPLE__ && defined __aarch64__ // On Apple Silicon + for (size_t i = 0; i < ROUNDS; i += 4) { + roundx4(state, i); + } +#else // On everywhere else for (size_t i = 0; i < ROUNDS; i += 2) { roundx2(state, i); } +#endif } }