Skip to content
Snippets Groups Projects
Commit 9aa70198 authored by Argyrios Kyrtzidis's avatar Argyrios Kyrtzidis
Browse files

[Support] Introduce the BLAKE3 hashing function implementation

BLAKE3 is a cryptographic hash function that is secure and very performant.
The C implementation originates from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
License is at https://github.com/BLAKE3-team/BLAKE3/blob/1.3.1/LICENSE

This patch adds:

* `llvm/include/llvm-c/blake3.h`: The BLAKE3 C API
* `llvm/include/llvm/Support/BLAKE3.h`: C++ wrapper of the C API
* `llvm/lib/Support/BLAKE3`: Directory containing the BLAKE3 C implementation files, including the `LICENSE` file
* `llvm/unittests/Support/BLAKE3Test.cpp`: unit tests for the BLAKE3 C++ wrapper

This initial patch contains the pristine BLAKE3 sources; a follow-up patch will introduce
LLVM-specific prefixes to avoid conflicts if a client also links with its own BLAKE3 version.

Here are some timings comparing BLAKE3 with LLVM's SHA1/SHA256/MD5.
Timings include `AVX512`, `AVX2`, `neon`, and the generic/portable implementations.
The table shows the speed-up multiplier of BLAKE3 for hashing 100 MBs:

|        Processor        | SHA1  | SHA256 |  MD5 |
|-------------------------|-------|--------|------|
| Intel Xeon W (AVX512)   | 10.4x |   27x  | 9.4x |
| Intel Xeon W (AVX2)     | 6.5x  |   17x  | 5.9x |
| Intel Xeon W (portable) | 1.3x  |  3.3x  | 1.1x |
|      M1Pro (neon)       | 2.1x  |  4.7x  | 2.8x |
|      M1Pro (portable)   | 1.1x  |  2.4x  | 1.5x |

Differential Revision: https://reviews.llvm.org/D121510
parent 8722c12c
No related branches found
No related tags found
No related merge requests found
Showing
with 17414 additions and 0 deletions
#ifndef BLAKE3_H
#define BLAKE3_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// Version of the upstream BLAKE3 C implementation these sources come from.
#define BLAKE3_VERSION_STRING "1.3.1"
// Size in bytes of the key taken by blake3_hasher_init_keyed.
#define BLAKE3_KEY_LEN 32
// Default output (digest) length in bytes.
#define BLAKE3_OUT_LEN 32
// Size in bytes of the block buffer held in blake3_chunk_state.
#define BLAKE3_BLOCK_LEN 64
// NOTE(review): presumably the number of input bytes per chunk -- confirm
// against the implementation.
#define BLAKE3_CHUNK_LEN 1024
// Bounds the chaining-value stack in blake3_hasher, which has room for
// BLAKE3_MAX_DEPTH + 1 entries.
#define BLAKE3_MAX_DEPTH 54
// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
//
// NOTE(review): the per-field notes below are inferred from names and types
// only; the code that reads and writes them is not in this header. Confirm
// against blake3.c before relying on them.
typedef struct {
// Eight 32-bit words; presumably the chaining value for the current chunk.
uint32_t cv[8];
// Presumably the index of the chunk currently being processed.
uint64_t chunk_counter;
// Buffer sized to hold exactly one block (BLAKE3_BLOCK_LEN bytes).
uint8_t buf[BLAKE3_BLOCK_LEN];
// Presumably the number of valid bytes currently stored in buf.
uint8_t buf_len;
// Presumably the number of blocks already compressed in this chunk.
uint8_t blocks_compressed;
// Presumably mode/domain flag bits passed to the compression function.
uint8_t flags;
} blake3_chunk_state;
// Incremental BLAKE3 hashing state. The fields are private; use the
// blake3_hasher_* functions declared in this header to operate on it. The
// whole state is stored inline in this struct (fixed-size arrays only).
typedef struct {
// Eight 32-bit words, i.e. BLAKE3_KEY_LEN bytes in total.
uint32_t key[8];
// State of the chunk currently being hashed.
blake3_chunk_state chunk;
// NOTE(review): presumably the number of entries currently held in cv_stack.
uint8_t cv_stack_len;
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because we
// don't know whether more input is coming. This is different from how the
// reference implementation does things.
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;
// Returns a version string for this implementation (presumably
// BLAKE3_VERSION_STRING -- confirm against blake3.c).
const char *blake3_version(void);
// Initialize a blake3_hasher in the default hashing mode.
void blake3_hasher_init(blake3_hasher *self);
// Initialize a blake3_hasher in the keyed hashing mode. The key must be
// exactly BLAKE3_KEY_LEN (32) bytes.
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
// Initialize a blake3_hasher in the key derivation mode. `context` is a
// null-terminated C string that should be hardcoded, globally unique, and
// application-specific; the input key material is supplied afterwards with
// blake3_hasher_update.
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
// Same as blake3_hasher_init_derive_key, except that the context is given as
// a pointer to `context_len` arbitrary bytes instead of a C string. Intended
// for language bindings.
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
// Add `input_len` bytes of input to the hasher. This can be called any number
// of times.
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
// Finalize the hasher and write `out_len` bytes of output to `out`. This
// doesn't modify the hasher itself, and it's possible to finalize again after
// adding more input. BLAKE3_OUT_LEN is the default output length.
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
// The same as blake3_hasher_finalize, but with an additional `seek` parameter
// giving the starting byte position in the output stream.
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
// Reset the hasher to its initial state, prior to any calls to
// blake3_hasher_update.
void blake3_hasher_reset(blake3_hasher *self);
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_H */
//==- BLAKE3.h - BLAKE3 C++ wrapper for LLVM ---------------------*- C++ -*-==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This is a C++ wrapper of the BLAKE3 C interface.
//
//===----------------------------------------------------------------------===//
#ifndef LLVM_SUPPORT_BLAKE3_H
#define LLVM_SUPPORT_BLAKE3_H
#include "llvm-c/blake3.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
namespace llvm {
/// Fixed-size byte array that holds a BLAKE3 output of \p NumBytes bytes.
///
/// The constant \p BLAKE3_OUT_LEN provides the default output length,
/// 32 bytes, which is recommended for most callers.
///
/// Outputs shorter than the default length of 32 bytes (256 bits) provide
/// less security. An N-bit BLAKE3 output is intended to provide N bits of
/// first and second preimage resistance and N/2 bits of collision
/// resistance, for any N up to 256. Longer outputs don't provide any
/// additional security.
///
/// Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly
/// requesting a short output is equivalent to truncating the default-length
/// output.
///
/// \tparam NumBytes Length of the output in bytes; defaults to
/// \p BLAKE3_OUT_LEN.
template <size_t NumBytes = BLAKE3_OUT_LEN>
using BLAKE3Result = std::array<uint8_t, NumBytes>;
/// A class that wraps the BLAKE3 algorithm.
///
/// The object holds a \c blake3_hasher by value and forwards to the BLAKE3 C
/// API. Input is fed incrementally with \c update(); the output is obtained
/// with \c final(), which does not change the hasher state and can therefore
/// be called on a const object and called again after more input is added.
class BLAKE3 {
public:
  /// Construct a hasher in the default hashing mode.
  BLAKE3() { init(); }

  /// Reinitialize the internal state.
  void init() { blake3_hasher_init(&Hasher); }

  /// Digest more data.
  ///
  /// \param Data Bytes to add to the hasher input.
  void update(ArrayRef<uint8_t> Data) {
    blake3_hasher_update(&Hasher, Data.data(), Data.size());
  }

  /// Digest more data.
  ///
  /// \param Str Characters to add to the hasher input.
  void update(StringRef Str) {
    blake3_hasher_update(&Hasher, Str.data(), Str.size());
  }

  /// Finalize the hasher and put the result in \p Result.
  /// This doesn't modify the hasher itself, and it's possible to finalize again
  /// after adding more input.
  ///
  /// \param Result Receives \p NumBytes bytes of output.
  template <size_t NumBytes = BLAKE3_OUT_LEN>
  void final(BLAKE3Result<NumBytes> &Result) const {
    // blake3_hasher_finalize takes a const hasher, so this member is const.
    blake3_hasher_finalize(&Hasher, Result.data(), Result.size());
  }

  /// Finalize the hasher and return an output of any length, given in bytes.
  /// This doesn't modify the hasher itself, and it's possible to finalize again
  /// after adding more input.
  ///
  /// \returns An array with \p NumBytes bytes of output.
  template <size_t NumBytes = BLAKE3_OUT_LEN>
  BLAKE3Result<NumBytes> final() const {
    BLAKE3Result<NumBytes> Result;
    blake3_hasher_finalize(&Hasher, Result.data(), Result.size());
    return Result;
  }

  /// Returns a BLAKE3 hash for the given data.
  ///
  /// \param Data Bytes to hash in one shot.
  /// \returns An array with \p NumBytes bytes of output.
  template <size_t NumBytes = BLAKE3_OUT_LEN>
  static BLAKE3Result<NumBytes> hash(ArrayRef<uint8_t> Data) {
    BLAKE3 Hasher;
    Hasher.update(Data);
    return Hasher.final<NumBytes>();
  }

private:
  /// State of the underlying C implementation.
  blake3_hasher Hasher;
};
} // namespace llvm
#endif
DisableFormat: true
SortIncludes: Never
# Sources that are built on every target: the core implementation, the
# dispatch layer, and the portable C fallback.
set(LLVM_BLAKE3_FILES
  BLAKE3/blake3.c
  BLAKE3/blake3_dispatch.c
  BLAKE3/blake3_portable.c
)

# The BLAKE3 team recommends using the assembly versions, from the README:
#
# "For each of the x86 SIMD instruction sets, four versions are available:
# three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
# version using C intrinsics. The assembly versions are generally
# preferred. They perform better, they perform more consistently across
# different compilers, and they build more quickly."
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
  if (MSVC)
    # MASM-syntax assembly for the MSVC toolchain.
    list(APPEND LLVM_BLAKE3_FILES
      BLAKE3/blake3_sse2_x86-64_windows_msvc.asm
      BLAKE3/blake3_sse41_x86-64_windows_msvc.asm
      BLAKE3/blake3_avx2_x86-64_windows_msvc.asm
      BLAKE3/blake3_avx512_x86-64_windows_msvc.asm
    )
  elseif(WIN32)
    # GNU-syntax assembly using the Windows calling convention.
    list(APPEND LLVM_BLAKE3_FILES
      BLAKE3/blake3_sse2_x86-64_windows_gnu.S
      BLAKE3/blake3_sse41_x86-64_windows_gnu.S
      BLAKE3/blake3_avx2_x86-64_windows_gnu.S
      BLAKE3/blake3_avx512_x86-64_windows_gnu.S
    )
  else()
    # GNU-syntax assembly for Unix-like systems.
    list(APPEND LLVM_BLAKE3_FILES
      BLAKE3/blake3_sse2_x86-64_unix.S
      BLAKE3/blake3_sse41_x86-64_unix.S
      BLAKE3/blake3_avx2_x86-64_unix.S
      BLAKE3/blake3_avx512_x86-64_unix.S
    )
  endif()
endif()

# NEON implementation for ARM targets. 64-bit ARM is reported as "arm64" on
# some hosts (e.g. macOS) but as "aarch64" on others (e.g. Linux), so both
# spellings have to be accepted; matching only "arm" leaves the NEON source
# out of aarch64 builds.
if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64")
  list(APPEND LLVM_BLAKE3_FILES
    BLAKE3/blake3_neon.c
  )
endif()

# Export the final list to the including directory's scope.
set(LLVM_BLAKE3_FILES
  ${LLVM_BLAKE3_FILES}
  PARENT_SCOPE
)
This work is released into the public domain with CC0 1.0. Alternatively, it is
licensed under the Apache License 2.0.
-------------------------------------------------------------------------------
Creative Commons Legal Code
CC0 1.0 Universal
CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
HEREUNDER.
Statement of Purpose
The laws of most jurisdictions throughout the world automatically confer
exclusive Copyright and Related Rights (defined below) upon the creator
and subsequent owner(s) (each and all, an "owner") of an original work of
authorship and/or a database (each, a "Work").
Certain owners wish to permanently relinquish those rights to a Work for
the purpose of contributing to a commons of creative, cultural and
scientific works ("Commons") that the public can reliably and without fear
of later claims of infringement build upon, modify, incorporate in other
works, reuse and redistribute as freely as possible in any form whatsoever
and for any purposes, including without limitation commercial purposes.
These owners may contribute to the Commons to promote the ideal of a free
culture and the further production of creative, cultural and scientific
works, or to gain reputation or greater distribution for their Work in
part through the use and efforts of others.
For these and/or other purposes and motivations, and without any
expectation of additional consideration or compensation, the person
associating CC0 with a Work (the "Affirmer"), to the extent that he or she
is an owner of Copyright and Related Rights in the Work, voluntarily
elects to apply CC0 to the Work and publicly distribute the Work under its
terms, with knowledge of his or her Copyright and Related Rights in the
Work and the meaning and intended legal effect of CC0 on those rights.
1. Copyright and Related Rights. A Work made available under CC0 may be
protected by copyright and related or neighboring rights ("Copyright and
Related Rights"). Copyright and Related Rights include, but are not
limited to, the following:
i. the right to reproduce, adapt, distribute, perform, display,
communicate, and translate a Work;
ii. moral rights retained by the original author(s) and/or performer(s);
iii. publicity and privacy rights pertaining to a person's image or
likeness depicted in a Work;
iv. rights protecting against unfair competition in regards to a Work,
subject to the limitations in paragraph 4(a), below;
v. rights protecting the extraction, dissemination, use and reuse of data
in a Work;
vi. database rights (such as those arising under Directive 96/9/EC of the
European Parliament and of the Council of 11 March 1996 on the legal
protection of databases, and under any national implementation
thereof, including any amended or successor version of such
directive); and
vii. other similar, equivalent or corresponding rights throughout the
world based on applicable law or treaty, and any national
implementations thereof.
2. Waiver. To the greatest extent permitted by, but not in contravention
of, applicable law, Affirmer hereby overtly, fully, permanently,
irrevocably and unconditionally waives, abandons, and surrenders all of
Affirmer's Copyright and Related Rights and associated claims and causes
of action, whether now known or unknown (including existing as well as
future claims and causes of action), in the Work (i) in all territories
worldwide, (ii) for the maximum duration provided by applicable law or
treaty (including future time extensions), (iii) in any current or future
medium and for any number of copies, and (iv) for any purpose whatsoever,
including without limitation commercial, advertising or promotional
purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
member of the public at large and to the detriment of Affirmer's heirs and
successors, fully intending that such Waiver shall not be subject to
revocation, rescission, cancellation, termination, or any other legal or
equitable action to disrupt the quiet enjoyment of the Work by the public
as contemplated by Affirmer's express Statement of Purpose.
3. Public License Fallback. Should any part of the Waiver for any reason
be judged legally invalid or ineffective under applicable law, then the
Waiver shall be preserved to the maximum extent permitted taking into
account Affirmer's express Statement of Purpose. In addition, to the
extent the Waiver is so judged Affirmer hereby grants to each affected
person a royalty-free, non transferable, non sublicensable, non exclusive,
irrevocable and unconditional license to exercise Affirmer's Copyright and
Related Rights in the Work (i) in all territories worldwide, (ii) for the
maximum duration provided by applicable law or treaty (including future
time extensions), (iii) in any current or future medium and for any number
of copies, and (iv) for any purpose whatsoever, including without
limitation commercial, advertising or promotional purposes (the
"License"). The License shall be deemed effective as of the date CC0 was
applied by Affirmer to the Work. Should any part of the License for any
reason be judged legally invalid or ineffective under applicable law, such
partial invalidity or ineffectiveness shall not invalidate the remainder
of the License, and in such case Affirmer hereby affirms that he or she
will not (i) exercise any of his or her remaining Copyright and Related
Rights in the Work or (ii) assert any associated claims and causes of
action with respect to the Work, in either case contrary to Affirmer's
express Statement of Purpose.
4. Limitations and Disclaimers.
a. No trademark or patent rights held by Affirmer are waived, abandoned,
surrendered, licensed or otherwise affected by this document.
b. Affirmer offers the Work as-is and makes no representations or
warranties of any kind concerning the Work, express, implied,
statutory or otherwise, including without limitation warranties of
title, merchantability, fitness for a particular purpose, non
infringement, or the absence of latent or other defects, accuracy, or
the present or absence of errors, whether or not discoverable, all to
the greatest extent permissible under applicable law.
c. Affirmer disclaims responsibility for clearing rights of other persons
that may apply to the Work or any use thereof, including without
limitation any person's Copyright and Related Rights in the Work.
Further, Affirmer disclaims responsibility for obtaining any necessary
consents, permissions or other rights required for any use of the
Work.
d. Affirmer understands and acknowledges that Creative Commons is not a
party to this document and has no duty or obligation with respect to
this CC0 or use of the Work.
-------------------------------------------------------------------------------
Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2019 Jack O'Connor and Samuel Neves
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
The official C implementation of BLAKE3.
# Example
An example program that hashes bytes from standard input and prints the
result:
```c
#include "blake3.h"
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
int main() {
// Initialize the hasher.
blake3_hasher hasher;
blake3_hasher_init(&hasher);
// Read input bytes from stdin.
unsigned char buf[65536];
while (1) {
ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
if (n > 0) {
blake3_hasher_update(&hasher, buf, n);
} else if (n == 0) {
break; // end of file
} else {
fprintf(stderr, "read failed: %s\n", strerror(errno));
exit(1);
}
}
// Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
uint8_t output[BLAKE3_OUT_LEN];
blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
// Print the hash as hexadecimal.
for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
printf("%02x", output[i]);
}
printf("\n");
return 0;
}
```
The code above is included in this directory as `example.c`. If you're
on x86\_64 with a Unix-like OS, you can compile a working binary like
this:
```bash
gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
# API
## The Struct
```c
typedef struct {
// private fields
} blake3_hasher;
```
An incremental BLAKE3 hashing state, which can accept any number of
updates. This implementation doesn't allocate any heap memory, but
`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes
on x86-64. This size can be reduced by restricting the maximum input
length, as described in Section 5.4 of [the BLAKE3
spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
but this implementation doesn't currently support that strategy.
## Common API Functions
```c
void blake3_hasher_init(
blake3_hasher *self);
```
Initialize a `blake3_hasher` in the default hashing mode.
---
```c
void blake3_hasher_update(
blake3_hasher *self,
const void *input,
size_t input_len);
```
Add input to the hasher. This can be called any number of times.
---
```c
void blake3_hasher_finalize(
const blake3_hasher *self,
uint8_t *out,
size_t out_len);
```
Finalize the hasher and return an output of any length, given in bytes.
This doesn't modify the hasher itself, and it's possible to finalize
again after adding more input. The constant `BLAKE3_OUT_LEN` provides
the default output length, 32 bytes, which is recommended for most
callers.
Outputs shorter than the default length of 32 bytes (256 bits) provide
less security. An N-bit BLAKE3 output is intended to provide N bits of
first and second preimage resistance and N/2 bits of collision
resistance, for any N up to 256. Longer outputs don't provide any
additional security.
Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly
requesting a short output is equivalent to truncating the default-length
output. (Note that this is different between BLAKE2 and BLAKE3.)
## Less Common API Functions
```c
void blake3_hasher_init_keyed(
blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
```
Initialize a `blake3_hasher` in the keyed hashing mode. The key must be
exactly 32 bytes.
---
```c
void blake3_hasher_init_derive_key(
blake3_hasher *self,
const char *context);
```
Initialize a `blake3_hasher` in the key derivation mode. The context
string is given as an initialization parameter, and afterwards input key
material should be given with `blake3_hasher_update`. The context string
is a null-terminated C string which should be **hardcoded, globally
unique, and application-specific**. The context string should not
include any dynamic input like salts, nonces, or identifiers read from a
database at runtime. A good default format for the context string is
`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
2019-12-25 16:18:03 session tokens v1"`.
This function is intended for application code written in C. For
language bindings, see `blake3_hasher_init_derive_key_raw` below.
---
```c
void blake3_hasher_init_derive_key_raw(
blake3_hasher *self,
const void *context,
size_t context_len);
```
As `blake3_hasher_init_derive_key` above, except that the context string
is given as a pointer to an array of arbitrary bytes with a provided
length. This is intended for writing language bindings, where C string
conversion would add unnecessary overhead and new error cases. Unicode
strings should be encoded as UTF-8.
Application code in C should prefer `blake3_hasher_init_derive_key`,
which takes the context as a C string. If you need to use arbitrary
bytes as a context string in application code, consider whether you're
violating the requirement that context strings should be hardcoded.
---
```c
void blake3_hasher_finalize_seek(
const blake3_hasher *self,
uint64_t seek,
uint8_t *out,
size_t out_len);
```
The same as `blake3_hasher_finalize`, but with an additional `seek`
parameter for the starting byte position in the output stream. To
efficiently stream a large output without allocating memory, call this
function in a loop, incrementing `seek` by the output length each time.
---
```c
void blake3_hasher_reset(
blake3_hasher *self);
```
Reset the hasher to its initial state, prior to any calls to
`blake3_hasher_update`. Currently this is no different from calling
`blake3_hasher_init` or similar again. However, if this implementation gains
multithreading support in the future, and if `blake3_hasher` holds (optional)
threading resources, this function will reuse those resources. Until then, this
is mainly for feature compatibility with the Rust implementation.
# Building
This implementation is just C and assembly files. It doesn't include a
public-facing build system. (The `Makefile` in this directory is only
for testing.) Instead, the intention is that you can include these files
in whatever build system you're already using. This section describes
the commands your build system should execute, or which you can execute
by hand. Note that these steps may change in future versions.
## x86
Dynamic dispatch is enabled by default on x86. The implementation will
query the CPU at runtime to detect SIMD support, and it will use the
widest instruction set available. By default, `blake3_dispatch.c`
expects to be linked with code for five different instruction sets:
portable C, SSE2, SSE4.1, AVX2, and AVX-512.
For each of the x86 SIMD instruction sets, four versions are available:
three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
version using C intrinsics. The assembly versions are generally
preferred. They perform better, they perform more consistently across
different compilers, and they build more quickly. On the other hand, the
assembly versions are x86\_64-only, and you need to select the right
flavor for your target platform.
Here's an example of building a shared library on x86\_64 Linux using
the assembly implementations:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
blake3_avx512_x86-64_unix.S
```
When building the intrinsics-based implementations, you need to build
each implementation separately, with the corresponding instruction set
explicitly enabled in the compiler. Here's the same shared library using
the intrinsics-based implementations:
```bash
gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
```
Note above that building `blake3_avx512.c` requires both `-mavx512f` and
`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512`
flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`.
MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a
corresponding flag.
If you want to omit SIMD code entirely, you need to explicitly disable
each instruction set. Here's an example of building a shared library on
x86 with only portable code:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
-DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
```
## ARM NEON
The NEON implementation is enabled by default on AArch64, but not on
other ARM targets, since not all of them support it. To enable it, set
`BLAKE3_USE_NEON=1`. Here's an example of building a shared library on
ARM Linux with NEON support:
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \
blake3_portable.c blake3_neon.c
```
To explicitly disable using NEON instructions on AArch64, set
`BLAKE3_USE_NEON=0`.
```bash
gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \
blake3_portable.c
```
Note that on some targets (ARMv7 in particular), extra flags may be
required to activate NEON support in the compiler. If you see an error
like...
```
/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed
in call to always_inline ‘vaddq_u32’: target specific option mismatch
```
...then you may need to add something like `-mfpu=neon-vfpv4
-mfloat-abi=hard`.
## Other Platforms
The portable implementation should work on most other architectures. For
example:
```bash
gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c
```
# Multithreading
Unlike the Rust implementation, the C implementation doesn't currently support
multithreading. A future version of this library could add support by taking an
optional dependency on OpenMP or similar. Alternatively, we could expose a
lower-level API to allow callers to implement concurrency themselves. The
former would be more convenient and less error-prone, but the latter would give
callers the maximum possible amount of control. The best choice here depends on
the specific use case, so if you have a use case for multithreaded hashing in
C, please file a GitHub issue and let us know.
This diff is collapsed.
#ifndef BLAKE3_H
#define BLAKE3_H
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// Version of the BLAKE3 C sources this header belongs to.
// NOTE(review): presumably the string returned by blake3_version() -- confirm
// against blake3.c.
#define BLAKE3_VERSION_STRING "1.3.1"
// Length in bytes of the key taken by blake3_hasher_init_keyed.
#define BLAKE3_KEY_LEN 32
// Size in bytes of one entry in blake3_hasher.cv_stack.
// NOTE(review): presumably also the default digest length -- confirm.
#define BLAKE3_OUT_LEN 32
// Size in bytes of the block buffer in blake3_chunk_state.
#define BLAKE3_BLOCK_LEN 64
// NOTE(review): presumably the chunk size in bytes; it is not referenced
// anywhere else in this header -- confirm against blake3.c.
#define BLAKE3_CHUNK_LEN 1024
// Depth bound used to size blake3_hasher.cv_stack (MAX_DEPTH + 1 entries).
#define BLAKE3_MAX_DEPTH 54
// This struct is a private implementation detail. It has to be here because
// it's part of blake3_hasher below.
// NOTE(review): the per-field notes below are inferred from names and types
// only; the code that reads and writes these fields is not in this header.
typedef struct {
// Eight 32-bit words; presumably the current chaining value -- TODO confirm.
uint32_t cv[8];
// 64-bit counter; presumably the index of the chunk being hashed -- TODO
// confirm.
uint64_t chunk_counter;
// Buffer of BLAKE3_BLOCK_LEN (64) bytes.
uint8_t buf[BLAKE3_BLOCK_LEN];
// Presumably the number of bytes currently held in buf -- TODO confirm.
uint8_t buf_len;
// Presumably the number of blocks already compressed for this chunk -- TODO
// confirm.
uint8_t blocks_compressed;
// Flag bits; the individual flag values are not defined in this header.
uint8_t flags;
} blake3_chunk_state;
// Hasher state. A pointer to this struct is the first argument of every
// blake3_hasher_* function declared below.
typedef struct {
// Eight 32-bit key words.
// NOTE(review): presumably set by the blake3_hasher_init* functions -- confirm.
uint32_t key[8];
// Embedded chunk state (see blake3_chunk_state above).
blake3_chunk_state chunk;
// Presumably the number of entries currently stored in cv_stack -- TODO
// confirm.
uint8_t cv_stack_len;
// The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
// with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
// requires a 4th entry, rather than merging everything down to 1, because we
// don't know whether more input is coming. This is different from how the
// reference implementation does things.
// Each entry occupies BLAKE3_OUT_LEN bytes.
uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN];
} blake3_hasher;
// NOTE(review): presumably returns BLAKE3_VERSION_STRING -- confirm against
// blake3.c.
const char *blake3_version(void);
// Initialize a hasher. See blake3_hasher_init_keyed and
// blake3_hasher_init_derive_key for the keyed and key derivation modes.
void blake3_hasher_init(blake3_hasher *self);
// Initialize a hasher in the keyed hashing mode. The key must be exactly
// BLAKE3_KEY_LEN (32) bytes.
void blake3_hasher_init_keyed(blake3_hasher *self,
const uint8_t key[BLAKE3_KEY_LEN]);
// Initialize a hasher in the key derivation mode. `context` is a
// null-terminated C string that should be hardcoded, globally unique, and
// application-specific; the input key material is supplied afterwards with
// blake3_hasher_update. Intended for application code written in C.
void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context);
// As blake3_hasher_init_derive_key, except that the context is given as a
// pointer to `context_len` arbitrary bytes. Intended for language bindings,
// where C string conversion would add overhead and new error cases.
void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context,
size_t context_len);
// Add `input_len` bytes starting at `input` to the hasher.
// NOTE(review): presumably may be called repeatedly to hash input
// incrementally -- confirm against blake3.c.
void blake3_hasher_update(blake3_hasher *self, const void *input,
size_t input_len);
// Write `out_len` bytes of output to `out`. Takes the hasher as const.
// NOTE(review): presumably equivalent to blake3_hasher_finalize_seek with
// seek == 0 -- confirm against blake3.c.
void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out,
size_t out_len);
// The same as blake3_hasher_finalize, but with an additional `seek` parameter
// for the starting byte position in the output stream. To stream a large
// output without allocating memory, call this in a loop, incrementing `seek`
// by the output length each time.
void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek,
uint8_t *out, size_t out_len);
// Reset the hasher to its initial state, prior to any calls to
// blake3_hasher_update.
void blake3_hasher_reset(blake3_hasher *self);
#ifdef __cplusplus
}
#endif
#endif /* BLAKE3_H */
#include "blake3_impl.h"
#include <immintrin.h>
#define DEGREE 8
// Load 32 bytes from `src` into a 256-bit vector using the unaligned-load
// intrinsic, so `src` needs no particular alignment.
INLINE __m256i loadu(const uint8_t src[32]) {
  const __m256i *from = (const __m256i *)src;
  return _mm256_loadu_si256(from);
}
// Store the full 256-bit vector `src` (32 bytes) to `dest` using the
// unaligned-store intrinsic, so `dest` needs no particular alignment.
//
// The parameter is declared with 32 elements because that is how many bytes
// are written; the previous `dest[16]` declaration understated the required
// buffer size. Array parameters decay to pointers, so callers are unaffected.
INLINE void storeu(__m256i src, uint8_t dest[32]) {
  _mm256_storeu_si256((__m256i *)dest, src);
}
// Lane-wise addition of the eight 32-bit words in `a` and `b`.
INLINE __m256i addv(__m256i a, __m256i b) {
  const __m256i sum = _mm256_add_epi32(a, b);
  return sum;
}
// Bitwise XOR of two 256-bit vectors. Named "xorv" rather than "xor" because
// clang-format doesn't like the name "xor" for some reason.
INLINE __m256i xorv(__m256i a, __m256i b) {
  const __m256i mixed = _mm256_xor_si256(a, b);
  return mixed;
}
// Broadcast the 32-bit value `x` to all eight lanes of a 256-bit vector.
INLINE __m256i set1(uint32_t x) {
  const int32_t lane_value = (int32_t)x;
  return _mm256_set1_epi32(lane_value);
}
// Rotate every 32-bit lane of `x` by 16 bits. Done as a byte shuffle that
// swaps the two 16-bit halves of each word; the same pattern is repeated for
// both 128-bit halves of the vector.
INLINE __m256i rot16(__m256i x) {
  const __m256i swap_halves =
      _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
                      13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
  return _mm256_shuffle_epi8(x, swap_halves);
}
// Rotate every 32-bit lane of `x` right by 12 bits, built from a logical
// right shift OR-ed with the complementary left shift.
INLINE __m256i rot12(__m256i x) {
  const __m256i shifted_down = _mm256_srli_epi32(x, 12);
  const __m256i wrapped_up = _mm256_slli_epi32(x, 32 - 12);
  return _mm256_or_si256(shifted_down, wrapped_up);
}
// Rotate every 32-bit lane of `x` right by 8 bits. Done as a byte shuffle
// that moves each byte of a word one position down and wraps the lowest byte
// to the top; the same pattern is repeated for both 128-bit halves.
INLINE __m256i rot8(__m256i x) {
  const __m256i rotate_bytes =
      _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
                      12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
  return _mm256_shuffle_epi8(x, rotate_bytes);
}
// Rotate every 32-bit lane of `x` right by 7 bits, built from a logical
// right shift OR-ed with the complementary left shift.
INLINE __m256i rot7(__m256i x) {
  const __m256i shifted_down = _mm256_srli_epi32(x, 7);
  const __m256i wrapped_up = _mm256_slli_epi32(x, 32 - 7);
  return _mm256_or_si256(shifted_down, wrapped_up);
}
// One mixing step on four state vectors v[a], v[b], v[c], v[d], folding in
// the two message vectors `mx` and `my`. Every operation acts on all eight
// 32-bit lanes at once.
INLINE void g_mix(__m256i v[16], size_t a, size_t b, size_t c, size_t d,
                  __m256i mx, __m256i my) {
  // First half: add message word mx, then rotate by 16 and 12.
  v[a] = addv(addv(v[a], mx), v[b]);
  v[d] = rot16(xorv(v[d], v[a]));
  v[c] = addv(v[c], v[d]);
  v[b] = rot12(xorv(v[b], v[c]));
  // Second half: add message word my, then rotate by 8 and 7.
  v[a] = addv(addv(v[a], my), v[b]);
  v[d] = rot8(xorv(v[d], v[a]));
  v[c] = addv(v[c], v[d]);
  v[b] = rot7(xorv(v[b], v[c]));
}

// Apply round `r` to the 16-vector state `v`, picking message vectors from
// `m` in the order given by MSG_SCHEDULE[r]. The four mixing steps within
// each group touch disjoint state vectors, so they are independent of each
// other.
INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) {
  // Columns of the 4x4 state.
  g_mix(v, 0, 4, 8, 12, m[(size_t)MSG_SCHEDULE[r][0]],
        m[(size_t)MSG_SCHEDULE[r][1]]);
  g_mix(v, 1, 5, 9, 13, m[(size_t)MSG_SCHEDULE[r][2]],
        m[(size_t)MSG_SCHEDULE[r][3]]);
  g_mix(v, 2, 6, 10, 14, m[(size_t)MSG_SCHEDULE[r][4]],
        m[(size_t)MSG_SCHEDULE[r][5]]);
  g_mix(v, 3, 7, 11, 15, m[(size_t)MSG_SCHEDULE[r][6]],
        m[(size_t)MSG_SCHEDULE[r][7]]);
  // Diagonals of the 4x4 state.
  g_mix(v, 0, 5, 10, 15, m[(size_t)MSG_SCHEDULE[r][8]],
        m[(size_t)MSG_SCHEDULE[r][9]]);
  g_mix(v, 1, 6, 11, 12, m[(size_t)MSG_SCHEDULE[r][10]],
        m[(size_t)MSG_SCHEDULE[r][11]]);
  g_mix(v, 2, 7, 8, 13, m[(size_t)MSG_SCHEDULE[r][12]],
        m[(size_t)MSG_SCHEDULE[r][13]]);
  g_mix(v, 3, 4, 9, 14, m[(size_t)MSG_SCHEDULE[r][14]],
        m[(size_t)MSG_SCHEDULE[r][15]]);
}
// Transpose, in place, the 8x8 matrix of 32-bit words held in the eight
// vectors of `vecs`. The transpose is built in three interleaving passes:
// 32-bit, then 64-bit, then 128-bit.
INLINE void transpose_vecs(__m256i vecs[DEGREE]) {
  // Pass 1: interleave 32-bit lanes of adjacent rows (0/1, 2/3, 4/5, 6/7).
  // The low unpack yields lanes 0, 1, 4, 5 of each pair and the high unpack
  // yields lanes 2, 3, 6, 7.
  __m256i pair_lo[4];
  __m256i pair_hi[4];
  for (size_t p = 0; p < 4; ++p) {
    pair_lo[p] = _mm256_unpacklo_epi32(vecs[2 * p], vecs[2 * p + 1]);
    pair_hi[p] = _mm256_unpackhi_epi32(vecs[2 * p], vecs[2 * p + 1]);
  }

  // Pass 2: interleave 64-bit lanes of adjacent pairs. quad[0] covers rows
  // 0-3 and quad[1] covers rows 4-7; index j holds columns j and j + 4.
  __m256i quad[2][4];
  for (size_t half = 0; half < 2; ++half) {
    const __m256i lo_first = pair_lo[2 * half];
    const __m256i lo_second = pair_lo[2 * half + 1];
    const __m256i hi_first = pair_hi[2 * half];
    const __m256i hi_second = pair_hi[2 * half + 1];
    quad[half][0] = _mm256_unpacklo_epi64(lo_first, lo_second);
    quad[half][1] = _mm256_unpackhi_epi64(lo_first, lo_second);
    quad[half][2] = _mm256_unpacklo_epi64(hi_first, hi_second);
    quad[half][3] = _mm256_unpackhi_epi64(hi_first, hi_second);
  }

  // Pass 3: interleave 128-bit lanes. Selector 0x20 joins the low halves of
  // both operands, 0x31 joins the high halves.
  for (size_t j = 0; j < 4; ++j) {
    vecs[j] = _mm256_permute2x128_si256(quad[0][j], quad[1][j], 0x20);
    vecs[j + 4] = _mm256_permute2x128_si256(quad[0][j], quad[1][j], 0x31);
  }
}
INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
size_t block_offset, __m256i out[16]) {
out[0] = loadu(&inputs[0][block_offset + 0 * sizeof(__m256i)]);
out[1] = loadu(&inputs[1][block_offset + 0 * sizeof(__m256i)]);
out[2] = loadu(&inputs[2][block_offset + 0 * sizeof(__m256i)]);
out[3] = loadu(&inputs[3][block_offset + 0 * sizeof(__m256i)]);
out[4] = loadu(&inputs[4][block_offset + 0 * sizeof(__m256i)]);
out[5] = loadu(&inputs[5][block_offset + 0 * sizeof(__m256i)]);
out[6] = loadu(&inputs[6][block_offset + 0 * sizeof(__m256i)]);
out[7] = loadu(&inputs[7][block_offset + 0 * sizeof(__m256i)]);
out[8] = loadu(&inputs[0][block_offset + 1 * sizeof(__m256i)]);
out[9] = loadu(&inputs[1][block_offset + 1 * sizeof(__m256i)]);
out[10] = loadu(&inputs[2][block_offset + 1 * sizeof(__m256i)]);
out[11] = loadu(&inputs[3][block_offset + 1 * sizeof(__m256i)]);
out[12] = loadu(&inputs[4][block_offset + 1 * sizeof(__m256i)]);
out[13] = loadu(&inputs[5][block_offset + 1 * sizeof(__m256i)]);
out[14] = loadu(&inputs[6][block_offset + 1 * sizeof(__m256i)]);
out[15] = loadu(&inputs[7][block_offset + 1 * sizeof(__m256i)]);
for (size_t i = 0; i < 8; ++i) {
_mm_prefetch((const void *)&inputs[i][block_offset + 256], _MM_HINT_T0);
}
transpose_vecs(&out[0]);
transpose_vecs(&out[8]);
}
// Build the per-lane 64-bit counters as two vectors of 32-bit halves.
// When `increment_counter` is true, lane i gets counter + i; otherwise every
// lane gets `counter`. The low words go to *out_lo and the high words to
// *out_hi.
INLINE void load_counters(uint64_t counter, bool increment_counter,
                          __m256i *out_lo, __m256i *out_hi) {
  // All-ones in every lane when incrementing, all-zeros otherwise.
  const __m256i enable = _mm256_set1_epi32(-(int32_t)increment_counter);
  const __m256i lane_index = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
  const __m256i offsets = _mm256_and_si256(enable, lane_index);

  const __m256i low_words =
      _mm256_add_epi32(_mm256_set1_epi32((int32_t)counter), offsets);

  // AVX2 only has a signed 32-bit compare; flipping the sign bit of both
  // operands turns it into an unsigned one. A lane overflowed exactly when
  // its offset is (unsigned) greater than the resulting sum.
  const __m256i sign_bit = _mm256_set1_epi32(0x80000000);
  const __m256i overflowed =
      _mm256_cmpgt_epi32(_mm256_xor_si256(offsets, sign_bit),
                         _mm256_xor_si256(low_words, sign_bit));

  // Overflowed lanes hold -1, so subtracting the mask adds the carry.
  const __m256i high_words = _mm256_sub_epi32(
      _mm256_set1_epi32((int32_t)(counter >> 32)), overflowed);

  *out_lo = low_words;
  *out_hi = high_words;
}
// Hash eight inputs in parallel, one input per 32-bit lane.
//
// Each input is read as `blocks` consecutive blocks of BLAKE3_BLOCK_LEN
// bytes. `flags` is applied to every block, `flags_start` additionally to the
// first block and `flags_end` additionally to the last one. The eight
// resulting 32-byte values are written back to back to `out` (256 bytes).
static void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
                              const uint32_t key[8], uint64_t counter,
                              bool increment_counter, uint8_t flags,
                              uint8_t flags_start, uint8_t flags_end,
                              uint8_t *out) {
  // Word i of the running state, broadcast across all eight inputs.
  __m256i h_vecs[8];
  for (size_t word = 0; word < 8; word++) {
    h_vecs[word] = set1(key[word]);
  }

  __m256i counter_low_vec, counter_high_vec;
  load_counters(counter, increment_counter, &counter_low_vec,
                &counter_high_vec);

  uint8_t block_flags = flags | flags_start;
  for (size_t block = 0; block < blocks; block++) {
    if (block + 1 == blocks) {
      block_flags |= flags_end;
    }

    __m256i msg_vecs[16];
    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);

    // State layout: running state, four IV words, counter low/high, block
    // length, block flags.
    __m256i v[16];
    for (size_t word = 0; word < 8; word++) {
      v[word] = h_vecs[word];
    }
    for (size_t word = 0; word < 4; word++) {
      v[8 + word] = set1(IV[word]);
    }
    v[12] = counter_low_vec;
    v[13] = counter_high_vec;
    v[14] = set1(BLAKE3_BLOCK_LEN);
    v[15] = set1(block_flags);

    for (size_t round = 0; round < 7; round++) {
      round_fn(v, msg_vecs, round);
    }

    // Fold the upper half of the state into the lower half.
    for (size_t word = 0; word < 8; word++) {
      h_vecs[word] = xorv(v[word], v[word + 8]);
    }

    // Start flags apply to the first block only.
    block_flags = flags;
  }

  // Regroup the words per input before storing.
  transpose_vecs(h_vecs);
  for (size_t lane = 0; lane < 8; lane++) {
    storeu(h_vecs[lane], &out[lane * sizeof(__m256i)]);
  }
}
// Forward declaration of the fallback that blake3_hash_many_avx2 below calls
// for the inputs left over after all full groups of DEGREE have been hashed:
// the SSE4.1 implementation, or the portable one when BLAKE3_NO_SSE41 is
// defined. Both have the same signature as blake3_hash_many_avx2.
#if !defined(BLAKE3_NO_SSE41)
void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#else
void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
size_t blocks, const uint32_t key[8],
uint64_t counter, bool increment_counter,
uint8_t flags, uint8_t flags_start,
uint8_t flags_end, uint8_t *out);
#endif
// Hash `num_inputs` inputs of `blocks` blocks each, writing BLAKE3_OUT_LEN
// bytes per input to `out`.
//
// Inputs are consumed in groups of DEGREE (8) by the AVX2 kernel. Whatever is
// left (fewer than DEGREE inputs, possibly none) is handed to the SSE4.1
// implementation, or to the portable one when BLAKE3_NO_SSE41 is defined.
// When `increment_counter` is set, the counter advances by one per input.
void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
                           size_t blocks, const uint32_t key[8],
                           uint64_t counter, bool increment_counter,
                           uint8_t flags, uint8_t flags_start,
                           uint8_t flags_end, uint8_t *out) {
  for (; num_inputs >= DEGREE; num_inputs -= DEGREE) {
    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
                      flags_start, flags_end, out);
    if (increment_counter) {
      counter += DEGREE;
    }
    inputs += DEGREE;
    out += DEGREE * BLAKE3_OUT_LEN;
  }
#if !defined(BLAKE3_NO_SSE41)
  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
                         increment_counter, flags, flags_start, flags_end, out);
#else
  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
                            increment_counter, flags, flags_start, flags_end,
                            out);
#endif
}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment