[Support] Introduce the BLAKE3 hashing function implementation

BLAKE3 is a cryptographic hash function that is secure and very performant.

The C implementation originates from https://github.com/BLAKE3-team/BLAKE3/tree/1.3.1/c
License is at https://github.com/BLAKE3-team/BLAKE3/blob/1.3.1/LICENSE

This patch adds:

* `llvm/include/llvm-c/blake3.h`: The BLAKE3 C API
* `llvm/include/llvm/Support/BLAKE3.h`: C++ wrapper of the C API
* `llvm/lib/Support/BLAKE3`: Directory containing the BLAKE3 C implementation files, including the `LICENSE` file
* `llvm/unittests/Support/BLAKE3Test.cpp`: unit tests for the BLAKE3 C++ wrapper

This initial patch contains the pristine BLAKE3 sources; a follow-up patch will introduce LLVM-specific prefixes to avoid conflicts if a client also links with its own BLAKE3 version.

Here are some timings comparing BLAKE3 with LLVM's SHA1/SHA256/MD5. Timings include `AVX512`, `AVX2`, `neon`, and the generic/portable implementations. The table shows the speed-up multiplier of BLAKE3 for hashing 100 MBs:

| Processor               | SHA1  | SHA256 | MD5  |
|-------------------------|-------|--------|------|
| Intel Xeon W (AVX512)   | 10.4x | 27x    | 9.4x |
| Intel Xeon W (AVX2)     | 6.5x  | 17x    | 5.9x |
| Intel Xeon W (portable) | 1.3x  | 3.3x   | 1.1x |
| M1Pro (neon)            | 2.1x  | 4.7x   | 2.8x |
| M1Pro (portable)        | 1.1x  | 2.4x   | 1.5x |

Differential Revision: https://reviews.llvm.org/D121510

[Support] Introduce the BLAKE3 hashing function implementation
9aa70198 · Argyrios Kyrtzidis · 8722c12c · 9aa70198 · 9aa70198 · 9aa70198
Commit 9aa70198 authored 3 years ago by Argyrios Kyrtzidis
--- a/llvm/include/llvm-c/blake3.h
+++ b/llvm/include/llvm-c/blake3.h
+#ifndef BLAKE3_H
+#define BLAKE3_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLAKE3_VERSION_STRING "1.3.1"
+#define BLAKE3_KEY_LEN 32
+#define BLAKE3_OUT_LEN 32
+#define BLAKE3_BLOCK_LEN 64
+#define BLAKE3_CHUNK_LEN 1024
+#define BLAKE3_MAX_DEPTH 54
+
+// This struct is a private implementation detail. It has to be here because
+// it's part of blake3_hasher below. Callers should not touch its fields.
+typedef struct {
+  uint32_t cv[8]; // 8 x 32-bit words = 32 bytes, the same size as BLAKE3_OUT_LEN
+  uint64_t chunk_counter;
+  uint8_t buf[BLAKE3_BLOCK_LEN]; // storage for exactly one block (64 bytes)
+  uint8_t buf_len; // NOTE(review): presumably the number of valid bytes in buf -- confirm in blake3.c
+  uint8_t blocks_compressed;
+  uint8_t flags;
+} blake3_chunk_state;
+
+typedef struct {
+  uint32_t key[8]; // 8 x 32-bit words = 32 bytes, i.e. BLAKE3_KEY_LEN
+  blake3_chunk_state chunk;
+  uint8_t cv_stack_len; // NOTE(review): presumably counts entries (not bytes) in cv_stack -- confirm in blake3.c
+  // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
+  // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
+  // requires a 4th entry, rather than merging everything down to 1, because we
+  // don't know whether more input is coming. This is different from how the
+  // reference implementation does things.
+  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; // (54 + 1) entries * 32 bytes = 1760 bytes
+} blake3_hasher; // incremental hashing state; all fields are private to the implementation
+
+const char *blake3_version(void); // NOTE(review): presumably returns BLAKE3_VERSION_STRING -- confirm in blake3.c
+void blake3_hasher_init(blake3_hasher *self); // Initialize in the default hashing mode.
+void blake3_hasher_init_keyed(blake3_hasher *self, // Initialize in the keyed hashing mode.
+                              const uint8_t key[BLAKE3_KEY_LEN]); // The key must be exactly 32 bytes.
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); // Key derivation mode; context is a null-terminated C string.
+void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, // As init_derive_key, but context is arbitrary bytes
+                                       size_t context_len); // with an explicit length (intended for language bindings).
+void blake3_hasher_update(blake3_hasher *self, const void *input, // Add input; can be called any number of times.
+                          size_t input_len);
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, // Write out_len bytes of output; does not modify the hasher.
+                            size_t out_len);
+void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, // Same as finalize, but starting at byte position seek
+                                 uint8_t *out, size_t out_len); // in the output stream.
+void blake3_hasher_reset(blake3_hasher *self); // Reset to the initial state, prior to any update calls.
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_H */
--- a/llvm/include/llvm/Support/BLAKE3.h
+++ b/llvm/include/llvm/Support/BLAKE3.h
+//==- BLAKE3.h - BLAKE3 C++ wrapper for LLVM ---------------------*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a C++ wrapper of the BLAKE3 C interface.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BLAKE3_H
+#define LLVM_SUPPORT_BLAKE3_H
+
+#include "llvm-c/blake3.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include <array>
+
+namespace llvm {
+
+/// The constant \p BLAKE3_OUT_LEN provides the default output length,
+/// 32 bytes, which is recommended for most callers.
+///
+/// Outputs shorter than the default length of 32 bytes (256 bits) provide
+/// less security. An N-bit BLAKE3 output is intended to provide N bits of
+/// first and second preimage resistance and N/2 bits of collision
+/// resistance, for any N up to 256. Longer outputs don't provide any
+/// additional security.
+///
+/// Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly
+/// requesting a short output is equivalent to truncating the default-length
+/// output. BLAKE3Result itself is a std::array of \p NumBytes bytes.
+template <size_t NumBytes = BLAKE3_OUT_LEN>
+using BLAKE3Result = std::array<uint8_t, NumBytes>;
+
+/// A class that wraps the BLAKE3 algorithm.
+///
+/// This is a thin C++ layer over the C \c blake3_hasher state: every method
+/// forwards to the corresponding \c blake3_hasher_* function.
+class BLAKE3 {
+public:
+  /// Construct a hasher in the default hashing mode, ready to accept input.
+  BLAKE3() { init(); }
+
+  /// Reinitialize the internal state in the default hashing mode.
+  void init() { blake3_hasher_init(&Hasher); }
+
+  /// Digest more data. This can be called any number of times.
+  void update(ArrayRef<uint8_t> Data) {
+    blake3_hasher_update(&Hasher, Data.data(), Data.size());
+  }
+
+  /// Digest more data, given as the bytes of \p Str.
+  void update(StringRef Str) {
+    blake3_hasher_update(&Hasher, Str.data(), Str.size());
+  }
+
+  /// Finalize the hasher and put the result in \p Result.
+  /// This doesn't modify the hasher itself (which is why it is \c const), and
+  /// it's possible to finalize again after adding more input.
+  template <size_t NumBytes = BLAKE3_OUT_LEN>
+  void final(BLAKE3Result<NumBytes> &Result) const {
+    blake3_hasher_finalize(&Hasher, Result.data(), Result.size());
+  }
+
+  /// Finalize the hasher and return an output of \p NumBytes bytes.
+  /// This doesn't modify the hasher itself (which is why it is \c const), and
+  /// it's possible to finalize again after adding more input.
+  template <size_t NumBytes = BLAKE3_OUT_LEN>
+  BLAKE3Result<NumBytes> final() const {
+    BLAKE3Result<NumBytes> Result;
+    blake3_hasher_finalize(&Hasher, Result.data(), Result.size());
+    return Result;
+  }
+
+  /// Returns a BLAKE3 hash of \p NumBytes bytes for the given data, using a
+  /// temporary hasher in the default hashing mode.
+  template <size_t NumBytes = BLAKE3_OUT_LEN>
+  static BLAKE3Result<NumBytes> hash(ArrayRef<uint8_t> Data) {
+    BLAKE3 Hasher;
+    Hasher.update(Data);
+    return Hasher.final<NumBytes>();
+  }
+
+private:
+  /// The underlying C hashing state; it is stored inline in this object.
+  blake3_hasher Hasher;
+};
+
+} // namespace llvm
+
+#endif
--- a/llvm/lib/Support/BLAKE3/.clang-format
+++ b/llvm/lib/Support/BLAKE3/.clang-format
+DisableFormat: true
+SortIncludes: Never
--- a/llvm/lib/Support/BLAKE3/CMakeLists.txt
+++ b/llvm/lib/Support/BLAKE3/CMakeLists.txt
+# Sources that are built on every target. The paths carry a "BLAKE3/" prefix
+# and the final list is exported with PARENT_SCOPE (see the bottom of this
+# file), presumably so the parent directory's CMakeLists.txt can use them as-is.
+set(LLVM_BLAKE3_FILES
+  BLAKE3/blake3.c
+  BLAKE3/blake3_dispatch.c
+  BLAKE3/blake3_portable.c
+)
+
+# The BLAKE3 team recommends using the assembly versions, from the README:
+#
+# "For each of the x86 SIMD instruction sets, four versions are available:
+# three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
+# version using C intrinsics. The assembly versions are generally
+# preferred. They perform better, they perform more consistently across
+# different compilers, and they build more quickly."
+#
+# Exactly one of the three assembly flavors is selected on 64-bit x86.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$")
+  if (MSVC)
+    list(APPEND LLVM_BLAKE3_FILES
+      BLAKE3/blake3_sse2_x86-64_windows_msvc.asm
+      BLAKE3/blake3_sse41_x86-64_windows_msvc.asm
+      BLAKE3/blake3_avx2_x86-64_windows_msvc.asm
+      BLAKE3/blake3_avx512_x86-64_windows_msvc.asm
+    )
+  elseif(WIN32)
+    list(APPEND LLVM_BLAKE3_FILES
+      BLAKE3/blake3_sse2_x86-64_windows_gnu.S
+      BLAKE3/blake3_sse41_x86-64_windows_gnu.S
+      BLAKE3/blake3_avx2_x86-64_windows_gnu.S
+      BLAKE3/blake3_avx512_x86-64_windows_gnu.S
+    )
+  else()
+    list(APPEND LLVM_BLAKE3_FILES
+      BLAKE3/blake3_sse2_x86-64_unix.S
+      BLAKE3/blake3_sse41_x86-64_unix.S
+      BLAKE3/blake3_avx2_x86-64_unix.S
+      BLAKE3/blake3_avx512_x86-64_unix.S
+    )
+  endif()
+endif()
+
+# NEON implementation for ARM targets. MATCHES is a case-sensitive regex
+# search, so the plain pattern "arm" covers "arm64"/"armv7l" but misses
+# "aarch64" (64-bit ARM Linux) and "ARM64" (Windows); list those spellings
+# explicitly so blake3_neon.c is built there too.
+# NOTE(review): the C sources are expected to auto-enable NEON on AArch64, in
+# which case omitting this file would be a link error -- confirm in
+# blake3_impl.h.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "arm|aarch64|ARM64")
+  list(APPEND LLVM_BLAKE3_FILES
+    BLAKE3/blake3_neon.c
+  )
+endif()
+
+# Export the assembled list to the including directory's scope.
+set(LLVM_BLAKE3_FILES
+  ${LLVM_BLAKE3_FILES}
+  PARENT_SCOPE
+)
--- a/llvm/lib/Support/BLAKE3/LICENSE
+++ b/llvm/lib/Support/BLAKE3/LICENSE
+This work is released into the public domain with CC0 1.0. Alternatively, it is
+licensed under the Apache License 2.0.
+
+-------------------------------------------------------------------------------
+
+Creative Commons Legal Code
+
+CC0 1.0 Universal
+
+    CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
+    LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
+    ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
+    INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
+    REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
+    PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
+    THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
+    HEREUNDER.
+
+Statement of Purpose
+
+The laws of most jurisdictions throughout the world automatically confer
+exclusive Copyright and Related Rights (defined below) upon the creator
+and subsequent owner(s) (each and all, an "owner") of an original work of
+authorship and/or a database (each, a "Work").
+
+Certain owners wish to permanently relinquish those rights to a Work for
+the purpose of contributing to a commons of creative, cultural and
+scientific works ("Commons") that the public can reliably and without fear
+of later claims of infringement build upon, modify, incorporate in other
+works, reuse and redistribute as freely as possible in any form whatsoever
+and for any purposes, including without limitation commercial purposes.
+These owners may contribute to the Commons to promote the ideal of a free
+culture and the further production of creative, cultural and scientific
+works, or to gain reputation or greater distribution for their Work in
+part through the use and efforts of others.
+
+For these and/or other purposes and motivations, and without any
+expectation of additional consideration or compensation, the person
+associating CC0 with a Work (the "Affirmer"), to the extent that he or she
+is an owner of Copyright and Related Rights in the Work, voluntarily
+elects to apply CC0 to the Work and publicly distribute the Work under its
+terms, with knowledge of his or her Copyright and Related Rights in the
+Work and the meaning and intended legal effect of CC0 on those rights.
+
+1. Copyright and Related Rights. A Work made available under CC0 may be
+protected by copyright and related or neighboring rights ("Copyright and
+Related Rights"). Copyright and Related Rights include, but are not
+limited to, the following:
+
+  i. the right to reproduce, adapt, distribute, perform, display,
+     communicate, and translate a Work;
+ ii. moral rights retained by the original author(s) and/or performer(s);
+iii. publicity and privacy rights pertaining to a person's image or
+     likeness depicted in a Work;
+ iv. rights protecting against unfair competition in regards to a Work,
+     subject to the limitations in paragraph 4(a), below;
+  v. rights protecting the extraction, dissemination, use and reuse of data
+     in a Work;
+ vi. database rights (such as those arising under Directive 96/9/EC of the
+     European Parliament and of the Council of 11 March 1996 on the legal
+     protection of databases, and under any national implementation
+     thereof, including any amended or successor version of such
+     directive); and
+vii. other similar, equivalent or corresponding rights throughout the
+     world based on applicable law or treaty, and any national
+     implementations thereof.
+
+2. Waiver. To the greatest extent permitted by, but not in contravention
+of, applicable law, Affirmer hereby overtly, fully, permanently,
+irrevocably and unconditionally waives, abandons, and surrenders all of
+Affirmer's Copyright and Related Rights and associated claims and causes
+of action, whether now known or unknown (including existing as well as
+future claims and causes of action), in the Work (i) in all territories
+worldwide, (ii) for the maximum duration provided by applicable law or
+treaty (including future time extensions), (iii) in any current or future
+medium and for any number of copies, and (iv) for any purpose whatsoever,
+including without limitation commercial, advertising or promotional
+purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
+member of the public at large and to the detriment of Affirmer's heirs and
+successors, fully intending that such Waiver shall not be subject to
+revocation, rescission, cancellation, termination, or any other legal or
+equitable action to disrupt the quiet enjoyment of the Work by the public
+as contemplated by Affirmer's express Statement of Purpose.
+
+3. Public License Fallback. Should any part of the Waiver for any reason
+be judged legally invalid or ineffective under applicable law, then the
+Waiver shall be preserved to the maximum extent permitted taking into
+account Affirmer's express Statement of Purpose. In addition, to the
+extent the Waiver is so judged Affirmer hereby grants to each affected
+person a royalty-free, non transferable, non sublicensable, non exclusive,
+irrevocable and unconditional license to exercise Affirmer's Copyright and
+Related Rights in the Work (i) in all territories worldwide, (ii) for the
+maximum duration provided by applicable law or treaty (including future
+time extensions), (iii) in any current or future medium and for any number
+of copies, and (iv) for any purpose whatsoever, including without
+limitation commercial, advertising or promotional purposes (the
+"License"). The License shall be deemed effective as of the date CC0 was
+applied by Affirmer to the Work. Should any part of the License for any
+reason be judged legally invalid or ineffective under applicable law, such
+partial invalidity or ineffectiveness shall not invalidate the remainder
+of the License, and in such case Affirmer hereby affirms that he or she
+will not (i) exercise any of his or her remaining Copyright and Related
+Rights in the Work or (ii) assert any associated claims and causes of
+action with respect to the Work, in either case contrary to Affirmer's
+express Statement of Purpose.
+
+4. Limitations and Disclaimers.
+
+ a. No trademark or patent rights held by Affirmer are waived, abandoned,
+    surrendered, licensed or otherwise affected by this document.
+ b. Affirmer offers the Work as-is and makes no representations or
+    warranties of any kind concerning the Work, express, implied,
+    statutory or otherwise, including without limitation warranties of
+    title, merchantability, fitness for a particular purpose, non
+    infringement, or the absence of latent or other defects, accuracy, or
+    the present or absence of errors, whether or not discoverable, all to
+    the greatest extent permissible under applicable law.
+ c. Affirmer disclaims responsibility for clearing rights of other persons
+    that may apply to the Work or any use thereof, including without
+    limitation any person's Copyright and Related Rights in the Work.
+    Further, Affirmer disclaims responsibility for obtaining any necessary
+    consents, permissions or other rights required for any use of the
+    Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+    party to this document and has no duty or obligation with respect to
+    this CC0 or use of the Work.
+
+-------------------------------------------------------------------------------
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2019 Jack O'Connor and Samuel Neves
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/llvm/lib/Support/BLAKE3/README.md
+++ b/llvm/lib/Support/BLAKE3/README.md
+The official C implementation of BLAKE3.
+
+# Example
+
+An example program that hashes bytes from standard input and prints the
+result:
+
+```c
+#include "blake3.h"
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+int main() {
+  // Initialize the hasher.
+  blake3_hasher hasher;
+  blake3_hasher_init(&hasher);
+
+  // Read input bytes from stdin.
+  unsigned char buf[65536];
+  while (1) {
+    ssize_t n = read(STDIN_FILENO, buf, sizeof(buf));
+    if (n > 0) {
+      blake3_hasher_update(&hasher, buf, n);
+    } else if (n == 0) {
+      break; // end of file
+    } else {
+      fprintf(stderr, "read failed: %s\n", strerror(errno));
+      exit(1);
+    }
+  }
+
+  // Finalize the hash. BLAKE3_OUT_LEN is the default output length, 32 bytes.
+  uint8_t output[BLAKE3_OUT_LEN];
+  blake3_hasher_finalize(&hasher, output, BLAKE3_OUT_LEN);
+
+  // Print the hash as hexadecimal.
+  for (size_t i = 0; i < BLAKE3_OUT_LEN; i++) {
+    printf("%02x", output[i]);
+  }
+  printf("\n");
+  return 0;
+}
+```
+
+The code above is included in this directory as `example.c`. If you're
+on x86\_64 with a Unix-like OS, you can compile a working binary like
+this:
+
+```bash
+gcc -O3 -o example example.c blake3.c blake3_dispatch.c blake3_portable.c \
+    blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
+    blake3_avx512_x86-64_unix.S
+```
+
+# API
+
+## The Struct
+
+```c
+typedef struct {
+  // private fields
+} blake3_hasher;
+```
+
+An incremental BLAKE3 hashing state, which can accept any number of
+updates. This implementation doesn't allocate any heap memory, but
+`sizeof(blake3_hasher)` itself is relatively large, currently 1912 bytes
+on x86-64. This size can be reduced by restricting the maximum input
+length, as described in Section 5.4 of [the BLAKE3
+spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf),
+but this implementation doesn't currently support that strategy.
+
+## Common API Functions
+
+```c
+void blake3_hasher_init(
+  blake3_hasher *self);
+```
+
+Initialize a `blake3_hasher` in the default hashing mode.
+
+---
+
+```c
+void blake3_hasher_update(
+  blake3_hasher *self,
+  const void *input,
+  size_t input_len);
+```
+
+Add input to the hasher. This can be called any number of times.
+
+---
+
+```c
+void blake3_hasher_finalize(
+  const blake3_hasher *self,
+  uint8_t *out,
+  size_t out_len);
+```
+
+Finalize the hasher and return an output of any length, given in bytes.
+This doesn't modify the hasher itself, and it's possible to finalize
+again after adding more input. The constant `BLAKE3_OUT_LEN` provides
+the default output length, 32 bytes, which is recommended for most
+callers.
+
+Outputs shorter than the default length of 32 bytes (256 bits) provide
+less security. An N-bit BLAKE3 output is intended to provide N bits of
+first and second preimage resistance and N/2 bits of collision
+resistance, for any N up to 256. Longer outputs don't provide any
+additional security.
+
+Shorter BLAKE3 outputs are prefixes of longer ones. Explicitly
+requesting a short output is equivalent to truncating the default-length
+output. (Note that this is different between BLAKE2 and BLAKE3.)
+
+## Less Common API Functions
+
+```c
+void blake3_hasher_init_keyed(
+  blake3_hasher *self,
+  const uint8_t key[BLAKE3_KEY_LEN]);
+```
+
+Initialize a `blake3_hasher` in the keyed hashing mode. The key must be
+exactly 32 bytes.
+
+---
+
+```c
+void blake3_hasher_init_derive_key(
+  blake3_hasher *self,
+  const char *context);
+```
+
+Initialize a `blake3_hasher` in the key derivation mode. The context
+string is given as an initialization parameter, and afterwards input key
+material should be given with `blake3_hasher_update`. The context string
+is a null-terminated C string which should be **hardcoded, globally
+unique, and application-specific**. The context string should not
+include any dynamic input like salts, nonces, or identifiers read from a
+database at runtime. A good default format for the context string is
+`"[application] [commit timestamp] [purpose]"`, e.g., `"example.com
+2019-12-25 16:18:03 session tokens v1"`.
+
+This function is intended for application code written in C. For
+language bindings, see `blake3_hasher_init_derive_key_raw` below.
+
+---
+
+```c
+void blake3_hasher_init_derive_key_raw(
+  blake3_hasher *self,
+  const void *context,
+  size_t context_len);
+```
+
+As `blake3_hasher_init_derive_key` above, except that the context string
+is given as a pointer to an array of arbitrary bytes with a provided
+length. This is intended for writing language bindings, where C string
+conversion would add unnecessary overhead and new error cases. Unicode
+strings should be encoded as UTF-8.
+
+Application code in C should prefer `blake3_hasher_init_derive_key`,
+which takes the context as a C string. If you need to use arbitrary
+bytes as a context string in application code, consider whether you're
+violating the requirement that context strings should be hardcoded.
+
+---
+
+```c
+void blake3_hasher_finalize_seek(
+  const blake3_hasher *self,
+  uint64_t seek,
+  uint8_t *out,
+  size_t out_len);
+```
+
+The same as `blake3_hasher_finalize`, but with an additional `seek`
+parameter for the starting byte position in the output stream. To
+efficiently stream a large output without allocating memory, call this
+function in a loop, incrementing `seek` by the output length each time.
+
+---
+
+```c
+void blake3_hasher_reset(
+  blake3_hasher *self);
+```
+
+Reset the hasher to its initial state, prior to any calls to
+`blake3_hasher_update`. Currently this is no different from calling
+`blake3_hasher_init` or similar again. However, if this implementation gains
+multithreading support in the future, and if `blake3_hasher` holds (optional)
+threading resources, this function will reuse those resources. Until then, this
+is mainly for feature compatibility with the Rust implementation.
+
+
+# Building
+
+This implementation is just C and assembly files. It doesn't include a
+public-facing build system. (The `Makefile` in this directory is only
+for testing.) Instead, the intention is that you can include these files
+in whatever build system you're already using. This section describes
+the commands your build system should execute, or which you can execute
+by hand. Note that these steps may change in future versions.
+
+## x86
+
+Dynamic dispatch is enabled by default on x86. The implementation will
+query the CPU at runtime to detect SIMD support, and it will use the
+widest instruction set available. By default, `blake3_dispatch.c`
+expects to be linked with code for five different instruction sets:
+portable C, SSE2, SSE4.1, AVX2, and AVX-512.
+
+For each of the x86 SIMD instruction sets, four versions are available:
+three flavors of assembly (Unix, Windows MSVC, and Windows GNU) and one
+version using C intrinsics. The assembly versions are generally
+preferred. They perform better, they perform more consistently across
+different compilers, and they build more quickly. On the other hand, the
+assembly versions are x86\_64-only, and you need to select the right
+flavor for your target platform.
+
+Here's an example of building a shared library on x86\_64 Linux using
+the assembly implementations:
+
+```bash
+gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
+    blake3_sse2_x86-64_unix.S blake3_sse41_x86-64_unix.S blake3_avx2_x86-64_unix.S \
+    blake3_avx512_x86-64_unix.S
+```
+
+When building the intrinsics-based implementations, you need to build
+each implementation separately, with the corresponding instruction set
+explicitly enabled in the compiler. Here's the same shared library using
+the intrinsics-based implementations:
+
+```bash
+gcc -c -fPIC -O3 -msse2 blake3_sse2.c -o blake3_sse2.o
+gcc -c -fPIC -O3 -msse4.1 blake3_sse41.c -o blake3_sse41.o
+gcc -c -fPIC -O3 -mavx2 blake3_avx2.c -o blake3_avx2.o
+gcc -c -fPIC -O3 -mavx512f -mavx512vl blake3_avx512.c -o blake3_avx512.o
+gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c \
+    blake3_avx2.o blake3_avx512.o blake3_sse41.o blake3_sse2.o
+```
+
+Note above that building `blake3_avx512.c` requires both `-mavx512f` and
+`-mavx512vl` under GCC and Clang. Under MSVC, the single `/arch:AVX512`
+flag is sufficient. The MSVC equivalent of `-mavx2` is `/arch:AVX2`.
+MSVC enables SSE2 and SSE4.1 by default, and it doesn't have a
+corresponding flag.
+
+If you want to omit SIMD code entirely, you need to explicitly disable
+each instruction set. Here's an example of building a shared library on
+x86 with only portable code:
+
+```bash
+gcc -shared -O3 -o libblake3.so -DBLAKE3_NO_SSE2 -DBLAKE3_NO_SSE41 -DBLAKE3_NO_AVX2 \
+    -DBLAKE3_NO_AVX512 blake3.c blake3_dispatch.c blake3_portable.c
+```
+
+## ARM NEON
+
+The NEON implementation is enabled by default on AArch64, but not on
+other ARM targets, since not all of them support it. To enable it, set
+`BLAKE3_USE_NEON=1`. Here's an example of building a shared library on
+ARM Linux with NEON support:
+
+```bash
+gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=1 blake3.c blake3_dispatch.c \
+    blake3_portable.c blake3_neon.c
+```
+
+To explicitly disable using NEON instructions on AArch64, set
+`BLAKE3_USE_NEON=0`.
+
+```bash
+gcc -shared -O3 -o libblake3.so -DBLAKE3_USE_NEON=0 blake3.c blake3_dispatch.c \
+    blake3_portable.c
+```
+
+Note that on some targets (ARMv7 in particular), extra flags may be
+required to activate NEON support in the compiler. If you see an error
+like...
+
+```
+/usr/lib/gcc/armv7l-unknown-linux-gnueabihf/9.2.0/include/arm_neon.h:635:1: error: inlining failed
+in call to always_inline ‘vaddq_u32’: target specific option mismatch
+```
+
+...then you may need to add something like `-mfpu=neon-vfpv4
+-mfloat-abi=hard`.
+
+## Other Platforms
+
+The portable implementation should work on most other architectures. For
+example:
+
+```bash
+gcc -shared -O3 -o libblake3.so blake3.c blake3_dispatch.c blake3_portable.c
+```
+
+# Multithreading
+
+Unlike the Rust implementation, the C implementation doesn't currently support
+multithreading. A future version of this library could add support by taking an
+optional dependency on OpenMP or similar. Alternatively, we could expose a
+lower-level API to allow callers to implement concurrency themselves. The
+former would be more convenient and less error-prone, but the latter would give
+callers the maximum possible amount of control. The best choice here depends on
+the specific use case, so if you have a use case for multithreaded hashing in
+C, please file a GitHub issue and let us know.
--- a/llvm/lib/Support/BLAKE3/blake3.c
+++ b/llvm/lib/Support/BLAKE3/blake3.c
--- a/llvm/lib/Support/BLAKE3/blake3.h
+++ b/llvm/lib/Support/BLAKE3/blake3.h
+#ifndef BLAKE3_H
+#define BLAKE3_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BLAKE3_VERSION_STRING "1.3.1" // upstream BLAKE3 release these sources were taken from
+#define BLAKE3_KEY_LEN 32 // bytes; length of the key passed to blake3_hasher_init_keyed
+#define BLAKE3_OUT_LEN 32 // bytes; default output length, and the size of one cv_stack entry
+#define BLAKE3_BLOCK_LEN 64 // bytes; size of blake3_chunk_state.buf
+#define BLAKE3_CHUNK_LEN 1024 // bytes; presumably the input bytes per chunk (confirm in blake3.c)
+#define BLAKE3_MAX_DEPTH 54 // blake3_hasher.cv_stack holds MAX_DEPTH + 1 entries
+
+// This struct is a private implementation detail. It has to be here because
+// it's embedded by value in blake3_hasher below (as its `chunk` member).
+typedef struct {
+  uint32_t cv[8]; // eight 32-bit words; presumably the chaining value (confirm in blake3.c)
+  uint64_t chunk_counter; // presumably the index of the current chunk (confirm in blake3.c)
+  uint8_t buf[BLAKE3_BLOCK_LEN]; // one block-sized (64-byte) buffer
+  uint8_t buf_len; // presumably the number of valid bytes in buf (confirm in blake3.c)
+  uint8_t blocks_compressed; // presumably blocks already processed in this chunk (confirm)
+  uint8_t flags; // flag bits; their meanings are defined outside this header
+} blake3_chunk_state;
+
+typedef struct { // hasher state passed as `self` to every blake3_hasher_* function
+  uint32_t key[8]; // eight 32-bit words, i.e. BLAKE3_KEY_LEN (32) bytes
+  blake3_chunk_state chunk; // embedded by value, hence the definition above
+  uint8_t cv_stack_len; // presumably the number of entries in use on cv_stack (confirm)
+  // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example,
+  // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk
+  // requires a 4th entry, rather than merging everything down to 1, because we
+  // don't know whether more input is coming. This is different from how the
+  // reference implementation does things.
+  uint8_t cv_stack[(BLAKE3_MAX_DEPTH + 1) * BLAKE3_OUT_LEN]; // (54 + 1) * 32 = 1760 bytes
+} blake3_hasher;
+
+const char *blake3_version(void); // version string; presumably BLAKE3_VERSION_STRING (confirm in blake3.c)
+void blake3_hasher_init(blake3_hasher *self); // default initializer; keyed and derive-key variants follow
+void blake3_hasher_init_keyed(blake3_hasher *self, // initialize in the keyed hashing mode
+                              const uint8_t key[BLAKE3_KEY_LEN]); // key must be exactly 32 bytes
+void blake3_hasher_init_derive_key(blake3_hasher *self, const char *context); // key derivation mode; context is a null-terminated C string
+void blake3_hasher_init_derive_key_raw(blake3_hasher *self, const void *context, // as above, but context is arbitrary bytes
+                                       size_t context_len); // length of context in bytes
+void blake3_hasher_update(blake3_hasher *self, const void *input, // add input_len bytes of input to the hasher
+                          size_t input_len);
+void blake3_hasher_finalize(const blake3_hasher *self, uint8_t *out, // write out_len output bytes; the hasher is not modified
+                            size_t out_len); // BLAKE3_OUT_LEN is the default length
+void blake3_hasher_finalize_seek(const blake3_hasher *self, uint64_t seek, // like finalize, starting at byte `seek` of the output stream
+                                 uint8_t *out, size_t out_len);
+void blake3_hasher_reset(blake3_hasher *self); // return to the initial state, prior to any update calls
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* BLAKE3_H */
--- a/llvm/lib/Support/BLAKE3/blake3_avx2.c
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2.c
+#include "blake3_impl.h"
+
+#include <immintrin.h>
+
+#define DEGREE 8
+
+// Unaligned load of the 32 bytes starting at src into one 256-bit vector.
+INLINE __m256i loadu(const uint8_t src[32]) {
+  const __m256i *vec_ptr = (const __m256i *)src;
+  return _mm256_loadu_si256(vec_ptr);
+}
+
+// Unaligned store of the 256-bit vector src to the 32 bytes starting at dest.
+// The array bound is advisory only (the parameter decays to a pointer); it
+// names the 32 bytes this function actually writes, mirroring loadu.
+INLINE void storeu(__m256i src, uint8_t dest[32]) {
+  _mm256_storeu_si256((__m256i *)dest, src);
+}
+
+// Lane-wise addition of the eight 32-bit lanes of a and b.
+INLINE __m256i addv(__m256i a, __m256i b) {
+  return _mm256_add_epi32(a, b);
+}
+
+// Bitwise XOR of two 256-bit vectors. Named xorv rather than xor because
+// clang-format doesn't like the name "xor" (it is an alternative operator
+// token in C++ and an <iso646.h> macro in C).
+INLINE __m256i xorv(__m256i a, __m256i b) {
+  return _mm256_xor_si256(a, b);
+}
+
+// Broadcast the 32-bit value x into all eight lanes of a 256-bit vector.
+INLINE __m256i set1(uint32_t x) {
+  const int32_t bits = (int32_t)x;
+  return _mm256_set1_epi32(bits);
+}
+
+// Rotate every 32-bit lane by 16 bits. Implemented as a byte shuffle that
+// swaps the two 16-bit halves of each lane.
+INLINE __m256i rot16(__m256i x) {
+  const __m256i swap_halves =
+      _mm256_set_epi8(13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2,
+                      13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2);
+  return _mm256_shuffle_epi8(x, swap_halves);
+}
+
+// Rotate every 32-bit lane right by 12 bits: (x >> 12) | (x << 20).
+INLINE __m256i rot12(__m256i x) {
+  const __m256i low_part = _mm256_srli_epi32(x, 12);
+  const __m256i high_part = _mm256_slli_epi32(x, 32 - 12);
+  return _mm256_or_si256(low_part, high_part);
+}
+
+// Rotate every 32-bit lane right by 8 bits. Implemented as a byte shuffle:
+// within each lane, destination byte k takes source byte (k + 1) mod 4.
+INLINE __m256i rot8(__m256i x) {
+  const __m256i byte_rotate =
+      _mm256_set_epi8(12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1,
+                      12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1);
+  return _mm256_shuffle_epi8(x, byte_rotate);
+}
+
+// Rotate every 32-bit lane right by 7 bits: (x >> 7) | (x << 25).
+INLINE __m256i rot7(__m256i x) {
+  const __m256i low_part = _mm256_srli_epi32(x, 7);
+  const __m256i high_part = _mm256_slli_epi32(x, 32 - 7);
+  return _mm256_or_si256(low_part, high_part);
+}
+
+INLINE void round_fn(__m256i v[16], __m256i m[16], size_t r) { // one round, in place on v; r picks the MSG_SCHEDULE row; all ops act on 8 x 32-bit lanes
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][0]]); // column half: groups (v[i], v[i+4], v[i+8], v[i+12]), i = 0..3; add m[] at schedule entries 0, 2, 4, 6
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][2]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][4]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][6]]);
+  v[0] = addv(v[0], v[4]); // v[i] += v[i+4]
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]); // v[i+12] ^= v[i]
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot16(v[12]); // rotate v[i+12] by 16
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[15] = rot16(v[15]);
+  v[8] = addv(v[8], v[12]); // v[i+8] += v[i+12]
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]); // v[i+4] ^= v[i+8]
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot12(v[4]); // rotate v[i+4] by 12
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][1]]); // same columns again: add m[] at schedule entries 1, 3, 5, 7
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][3]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][5]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][7]]);
+  v[0] = addv(v[0], v[4]);
+  v[1] = addv(v[1], v[5]);
+  v[2] = addv(v[2], v[6]);
+  v[3] = addv(v[3], v[7]);
+  v[12] = xorv(v[12], v[0]);
+  v[13] = xorv(v[13], v[1]);
+  v[14] = xorv(v[14], v[2]);
+  v[15] = xorv(v[15], v[3]);
+  v[12] = rot8(v[12]); // second pass uses rotations 8 and 7 instead of 16 and 12
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[15] = rot8(v[15]);
+  v[8] = addv(v[8], v[12]);
+  v[9] = addv(v[9], v[13]);
+  v[10] = addv(v[10], v[14]);
+  v[11] = addv(v[11], v[15]);
+  v[4] = xorv(v[4], v[8]);
+  v[5] = xorv(v[5], v[9]);
+  v[6] = xorv(v[6], v[10]);
+  v[7] = xorv(v[7], v[11]);
+  v[4] = rot7(v[4]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][8]]); // diagonal half: groups (0,5,10,15), (1,6,11,12), (2,7,8,13), (3,4,9,14); add m[] at schedule entries 8, 10, 12, 14
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][10]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][12]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][14]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot16(v[15]);
+  v[12] = rot16(v[12]);
+  v[13] = rot16(v[13]);
+  v[14] = rot16(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot12(v[5]);
+  v[6] = rot12(v[6]);
+  v[7] = rot12(v[7]);
+  v[4] = rot12(v[4]);
+  v[0] = addv(v[0], m[(size_t)MSG_SCHEDULE[r][9]]); // same diagonals again: add m[] at schedule entries 9, 11, 13, 15
+  v[1] = addv(v[1], m[(size_t)MSG_SCHEDULE[r][11]]);
+  v[2] = addv(v[2], m[(size_t)MSG_SCHEDULE[r][13]]);
+  v[3] = addv(v[3], m[(size_t)MSG_SCHEDULE[r][15]]);
+  v[0] = addv(v[0], v[5]);
+  v[1] = addv(v[1], v[6]);
+  v[2] = addv(v[2], v[7]);
+  v[3] = addv(v[3], v[4]);
+  v[15] = xorv(v[15], v[0]);
+  v[12] = xorv(v[12], v[1]);
+  v[13] = xorv(v[13], v[2]);
+  v[14] = xorv(v[14], v[3]);
+  v[15] = rot8(v[15]);
+  v[12] = rot8(v[12]);
+  v[13] = rot8(v[13]);
+  v[14] = rot8(v[14]);
+  v[10] = addv(v[10], v[15]);
+  v[11] = addv(v[11], v[12]);
+  v[8] = addv(v[8], v[13]);
+  v[9] = addv(v[9], v[14]);
+  v[5] = xorv(v[5], v[10]);
+  v[6] = xorv(v[6], v[11]);
+  v[7] = xorv(v[7], v[8]);
+  v[4] = xorv(v[4], v[9]);
+  v[5] = rot7(v[5]);
+  v[6] = rot7(v[6]);
+  v[7] = rot7(v[7]);
+  v[4] = rot7(v[4]);
+}
+
+INLINE void transpose_vecs(__m256i vecs[DEGREE]) { // in-place 8x8 transpose of 32-bit elements: afterwards lane j of vecs[i] is the old lane i of vecs[j]
+  // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high
+  // is 22/33/66/77.
+  __m256i ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]);
+  __m256i ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]);
+  __m256i cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]);
+  __m256i cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]);
+  __m256i ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]);
+  __m256i ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]);
+  __m256i gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]);
+  __m256i gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]);
+
+  // Interleave 64-bit lanes. The low unpack is lanes 00/22 and the high is
+  // 11/33.
+  __m256i abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145);
+  __m256i abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145);
+  __m256i abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367);
+  __m256i abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367);
+  __m256i efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145);
+  __m256i efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145);
+  __m256i efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367);
+  __m256i efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367);
+
+  // Interleave 128-bit lanes, producing the eight transposed rows.
+  vecs[0] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x20); // 0x20: low 128 bits of each operand
+  vecs[1] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x20);
+  vecs[2] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x20);
+  vecs[3] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x20);
+  vecs[4] = _mm256_permute2x128_si256(abcd_04, efgh_04, 0x31); // 0x31: high 128 bits of each operand
+  vecs[5] = _mm256_permute2x128_si256(abcd_15, efgh_15, 0x31);
+  vecs[6] = _mm256_permute2x128_si256(abcd_26, efgh_26, 0x31);
+  vecs[7] = _mm256_permute2x128_si256(abcd_37, efgh_37, 0x31);
+}
+
+// Loads one 64-byte block (at block_offset) from each of the 8 inputs and
+// transposes it, so that on return out[w] holds 32-bit message word w of all
+// 8 inputs, with input i in lane i.
+INLINE void transpose_msg_vecs(const uint8_t *const *inputs,
+                               size_t block_offset, __m256i out[16]) {
+  // out[i] gets the first 32 bytes of input i's block and out[i + 8] the
+  // second 32 bytes.
+  for (size_t lane = 0; lane < 8; ++lane) {
+    const uint8_t *block_start = &inputs[lane][block_offset];
+    out[lane] = loadu(block_start);
+    out[lane + 8] = loadu(block_start + sizeof(__m256i));
+  }
+
+  // Hint the cache about the data 256 bytes past this block in every input.
+  for (size_t lane = 0; lane < 8; ++lane) {
+    _mm_prefetch((const void *)&inputs[lane][block_offset + 256], _MM_HINT_T0);
+  }
+
+  // Each half is an independent 8x8 transpose of 32-bit words.
+  transpose_vecs(out);
+  transpose_vecs(out + 8);
+}
+
+// Builds the per-lane 64-bit counters as separate low and high 32-bit halves.
+// Lane i gets counter + i when increment_counter is true, and counter
+// unchanged otherwise.
+INLINE void load_counters(uint64_t counter, bool increment_counter,
+                          __m256i *out_lo, __m256i *out_hi) {
+  // All-ones when incrementing, all-zeros otherwise, so the AND below either
+  // keeps or clears the per-lane offsets 0..7.
+  const __m256i lane_mask = _mm256_set1_epi32(-(int32_t)increment_counter);
+  const __m256i lane_index = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+  const __m256i offsets = _mm256_and_si256(lane_mask, lane_index);
+
+  const __m256i base_lo = _mm256_set1_epi32((int32_t)counter);
+  const __m256i base_hi = _mm256_set1_epi32((int32_t)(counter >> 32));
+  const __m256i lo = _mm256_add_epi32(base_lo, offsets);
+
+  // Unsigned "offsets > lo" means the low half wrapped around. The compare is
+  // signed, so flip the sign bit of both operands first.
+  const __m256i sign_bit = _mm256_set1_epi32(0x80000000);
+  const __m256i wrapped = _mm256_cmpgt_epi32(
+      _mm256_xor_si256(offsets, sign_bit), _mm256_xor_si256(lo, sign_bit));
+
+  // wrapped is -1 in lanes that overflowed, so subtracting it adds the carry.
+  *out_lo = lo;
+  *out_hi = _mm256_sub_epi32(base_hi, wrapped);
+}
+
+// Hashes 8 inputs of `blocks` 64-byte blocks each, one input per 32-bit lane,
+// and writes 8 consecutive 32-byte results to out (input i at out + 32 * i).
+// flags_start is applied to the first block only and flags_end to the last
+// block only; flags is applied to every block.
+static void blake3_hash8_avx2(const uint8_t *const *inputs, size_t blocks,
+                              const uint32_t key[8], uint64_t counter,
+                              bool increment_counter, uint8_t flags,
+                              uint8_t flags_start, uint8_t flags_end,
+                              uint8_t *out) {
+  // Every lane starts from the same key words.
+  __m256i h_vecs[8];
+  for (size_t word = 0; word < 8; ++word) {
+    h_vecs[word] = set1(key[word]);
+  }
+
+  __m256i counter_low_vec, counter_high_vec;
+  load_counters(counter, increment_counter, &counter_low_vec,
+                &counter_high_vec);
+
+  const __m256i block_len_vec = set1(BLAKE3_BLOCK_LEN);
+  uint8_t block_flags = flags | flags_start;
+
+  for (size_t block = 0; block < blocks; ++block) {
+    const bool is_last_block = (block + 1 == blocks);
+    if (is_last_block) {
+      block_flags |= flags_end;
+    }
+    const __m256i block_flags_vec = set1(block_flags);
+
+    __m256i msg_vecs[16];
+    transpose_msg_vecs(inputs, block * BLAKE3_BLOCK_LEN, msg_vecs);
+
+    // State layout: v[0..7] chaining value, v[8..11] IV words, then counter
+    // low/high, block length and flags.
+    __m256i v[16];
+    for (size_t word = 0; word < 8; ++word) {
+      v[word] = h_vecs[word];
+    }
+    for (size_t word = 0; word < 4; ++word) {
+      v[8 + word] = set1(IV[word]);
+    }
+    v[12] = counter_low_vec;
+    v[13] = counter_high_vec;
+    v[14] = block_len_vec;
+    v[15] = block_flags_vec;
+
+    // Seven rounds, kept as explicit calls so each gets a constant round
+    // index.
+    round_fn(v, msg_vecs, 0);
+    round_fn(v, msg_vecs, 1);
+    round_fn(v, msg_vecs, 2);
+    round_fn(v, msg_vecs, 3);
+    round_fn(v, msg_vecs, 4);
+    round_fn(v, msg_vecs, 5);
+    round_fn(v, msg_vecs, 6);
+
+    // New chaining value: XOR the two halves of the state.
+    for (size_t word = 0; word < 8; ++word) {
+      h_vecs[word] = xorv(v[word], v[word + 8]);
+    }
+
+    // Drop flags_start (and flags_end) after the block that carried them.
+    block_flags = flags;
+  }
+
+  // Back from "one word per vector" to "one input per vector", then store.
+  transpose_vecs(h_vecs);
+  for (size_t input = 0; input < 8; ++input) {
+    storeu(h_vecs[input], &out[input * sizeof(__m256i)]);
+  }
+}
+
+#if !defined(BLAKE3_NO_SSE41) // declare whichever implementation blake3_hash_many_avx2 hands its leftover inputs to
+void blake3_hash_many_sse41(const uint8_t *const *inputs, size_t num_inputs,
+                            size_t blocks, const uint32_t key[8],
+                            uint64_t counter, bool increment_counter,
+                            uint8_t flags, uint8_t flags_start,
+                            uint8_t flags_end, uint8_t *out);
+#else // BLAKE3_NO_SSE41 is defined: use the portable implementation instead
+void blake3_hash_many_portable(const uint8_t *const *inputs, size_t num_inputs,
+                               size_t blocks, const uint32_t key[8],
+                               uint64_t counter, bool increment_counter,
+                               uint8_t flags, uint8_t flags_start,
+                               uint8_t flags_end, uint8_t *out);
+#endif // !defined(BLAKE3_NO_SSE41)
+
+// Hashes num_inputs inputs of `blocks` blocks each. Inputs are consumed in
+// groups of DEGREE (8) by the AVX2 kernel; the remaining inputs (fewer than
+// DEGREE, possibly none) are passed to the SSE4.1 implementation, or to the
+// portable one when BLAKE3_NO_SSE41 is defined. Each input produces
+// BLAKE3_OUT_LEN bytes at consecutive positions in out.
+void blake3_hash_many_avx2(const uint8_t *const *inputs, size_t num_inputs,
+                           size_t blocks, const uint32_t key[8],
+                           uint64_t counter, bool increment_counter,
+                           uint8_t flags, uint8_t flags_start,
+                           uint8_t flags_end, uint8_t *out) {
+  for (; num_inputs >= DEGREE; num_inputs -= DEGREE) {
+    blake3_hash8_avx2(inputs, blocks, key, counter, increment_counter, flags,
+                      flags_start, flags_end, out);
+    // The counter only advances when the caller asked for per-input counters.
+    counter += increment_counter ? DEGREE : 0;
+    inputs += DEGREE;
+    out += DEGREE * BLAKE3_OUT_LEN;
+  }
+#if !defined(BLAKE3_NO_SSE41)
+  blake3_hash_many_sse41(inputs, num_inputs, blocks, key, counter,
+                         increment_counter, flags, flags_start, flags_end, out);
+#else
+  blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
+                            increment_counter, flags, flags_start, flags_end,
+                            out);
+#endif
+}
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_gnu.S
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_msvc.asm
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_windows_msvc.asm
--- a/llvm/lib/Support/BLAKE3/blake3_avx512.c
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512.c
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_gnu.S
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_msvc.asm
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_windows_msvc.asm
--- a/llvm/lib/Support/BLAKE3/blake3_dispatch.c
+++ b/llvm/lib/Support/BLAKE3/blake3_dispatch.c
--- a/llvm/lib/Support/BLAKE3/blake3_impl.h
+++ b/llvm/lib/Support/BLAKE3/blake3_impl.h
--- a/llvm/lib/Support/BLAKE3/blake3_neon.c
+++ b/llvm/lib/Support/BLAKE3/blake3_neon.c
--- a/llvm/lib/Support/BLAKE3/blake3_portable.c
+++ b/llvm/lib/Support/BLAKE3/blake3_portable.c