y_all_workaround/README.md
+@@ -0,0 +1,13 @@
++# any_all_workaround
++
++This is a workaround for bad codegen ([Rust bug](https://github.com/rust-lang/portable-simd/issues/146), [LLVM bug](https://github.com/llvm/llvm-project/issues/50466)) for the `any()` and `all()` reductions for NEON-backed SIMD vectors on 32-bit ARM. On other platforms these delegate to `any()` and `all()` in `core::simd`.
++
++The plan is to abandon this crate once the LLVM bug is fixed or `core::simd` works around the LLVM bug.
++
++The code is forked from the [`packed_simd` crate](https://raw.githubusercontent.com/hsivonen/packed_simd/d938e39bee9bc5c222f5f2f2a0df9e53b5ce36ae/src/codegen/reductions/mask/arm.rs).
++
++This crate requires Nightly Rust as it depends on the `portable_simd` feature.
++
++# License
++
++`MIT OR Apache-2.0`, since that's how `packed_simd` is licensed. (The ARM intrinsics Rust version workaround is from qcms, see LICENSE-MIT-QCMS.)
+diff --git a/third_party/rust/any_all_workaround/build.rs b/third_party/rust/any_all_workaround/build.rs
+new file mode 100644
+--- /dev/null
++++ third_party/rust/any_all_workaround/build.rs
+@@ -0,0 +1,7 @@
++extern crate version_check as rustc;
++
++fn main() {
++    if rustc::is_min_version("1.78.0").unwrap_or(false) {
++        println!("cargo:rustc-cfg=stdsimd_split");
++    }
++}
+diff --git a/third_party/rust/any_all_workaround/src/lib.rs b/third_party/rust/any_all_workaround/src/lib.rs
+new file mode 100644
+--- /dev/null
++++ third_party/rust/any_all_workaround/src/lib.rs
+@@ -0,0 +1,110 @@
++// This code began as a fork of
++// https://raw.githubusercontent.com/rust-lang/packed_simd/d938e39bee9bc5c222f5f2f2a0df9e53b5ce36ae/src/codegen/reductions/mask/arm.rs
++// which didn't have a license header on the file, but Cargo.toml said "MIT OR Apache-2.0".
++// See LICENSE-MIT and LICENSE-APACHE.
++
++#![no_std]
++#![feature(portable_simd)]
++#![cfg_attr(
++    all(
++        stdsimd_split,
++        target_arch = "arm",
++        target_endian = "little",
++        target_feature = "neon",
++        target_feature = "v7"
++    ),
++    feature(stdarch_arm_neon_intrinsics)
++)]
++#![cfg_attr(
++    all(
++        not(stdsimd_split),
++        target_arch = "arm",
++        target_endian = "little",
++        target_feature = "neon",
++        target_feature = "v7"
++    ),
++    feature(stdsimd)
++)]
++
++use cfg_if::cfg_if;
++use core::simd::mask16x8;
++use core::simd::mask32x4;
++use core::simd::mask8x16;
++
++cfg_if! {
++    if #[cfg(all(target_arch = "arm", target_endian = "little", target_feature = "neon", target_feature = "v7"))] {
++        use core::simd::mask8x8;
++        use core::simd::mask16x4;
++        use core::simd::mask32x2;
++        macro_rules! arm_128_v7_neon_impl {
++            ($all:ident, $any:ident, $id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
++                #[inline]
++                pub fn $all(s: $id) -> bool {
++                    use core::arch::arm::$vpmin;
++                    use core::mem::transmute;
++                    unsafe {
++                        union U {
++                            halves: ($half, $half),
++                            vec: $id,
++                        }
++                        let halves = U { vec: s }.halves;
++                        let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1)));
++                        h.all()
++                    }
++                }
++                #[inline]
++                pub fn $any(s: $id) -> bool {
++                    use core::arch::arm::$vpmax;
++                    use core::mem::transmute;
++                    unsafe {
++                        union U {
++                            halves: ($half, $half),
++                            vec: $id,
++                        }
++                        let halves = U { vec: s }.halves;
++                        let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1)));
++                        h.any()
++                    }
++                }
++            }
++        }
++    } else {
++        macro_rules! arm_128_v7_neon_impl {
++            ($all:ident, $any:ident, $id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => {
++                #[inline(always)]
++                pub fn $all(s: $id) -> bool {
++                    s.all()
++                }
++                #[inline(always)]
++                pub fn $any(s: $id) -> bool {
++                    s.any()
++                }
++            }
++        }
++    }
++}
++
++arm_128_v7_neon_impl!(
++    all_mask8x16,
++    any_mask8x16,
++    mask8x16,
++    mask8x8,
++    vpmin_u8,
++    vpmax_u8
++);
++arm_128_v7_neon_impl!(
++    all_mask16x8,
++    any_mask16x8,
++    mask16x8,
++    mask16x4,
++    vpmin_u16,
++    vpmax_u16
++);
++arm_128_v7_neon_impl!(
++    all_mask32x4,
++    any_mask32x4,
++    mask32x4,
++    mask32x2,
++    vpmin_u32,
++    vpmax_u32
++);
+diff --git a/third_party/rust/encoding_rs/Cargo.toml b/third_party/rust/encoding_rs/Cargo.toml
+--- third_party/rust/encoding_rs/Cargo.toml
++++ third_party/rust/encoding_rs/Cargo.toml
+@@ -6,18 +6,19 @@
+ # to registry (e.g., crates.io) dependencies.
+ #
+ # If you are reading this file be aware that the original Cargo.toml
+ # will likely look very different (and much more reasonable).
+ # See Cargo.toml.orig for the original contents.
+ 
+ [package]
+ edition = "2018"
++rust-version = "1.36"
+ name = "encoding_rs"
+-version = "0.8.33"
++version = "0.8.34"
+ authors = ["Henri Sivonen <hsivonen@hsivonen.fi>"]
+ description = "A Gecko-oriented implementation of the Encoding Standard"
+ homepage = "https://docs.rs/encoding_rs/"
+ documentation = "https://docs.rs/encoding_rs/"
+ readme = "README.md"
+ keywords = [
+     "encoding",
+     "web",
+@@ -31,23 +32,23 @@ categories = [
+     "internationalization",
+ ]
+ license = "(Apache-2.0 OR MIT) AND BSD-3-Clause"
+ repository = "https://github.com/hsivonen/encoding_rs"
+ 
+ [profile.release]
+ lto = true
+ 
++[dependencies.any_all_workaround]
++version = "0.1.0"
++optional = true
++
+ [dependencies.cfg-if]
+ version = "1.0"
+ 
+-[dependencies.packed_simd]
+-version = "0.3.9"
+-optional = true
+-
+ [dependencies.serde]
+ version = "1.0"
+ optional = true
+ 
+ [dev-dependencies.bincode]
+ version = "1.0"
+ 
+ [dev-dependencies.serde_derive]
+@@ -69,15 +70,9 @@ fast-legacy-encode = [
+     "fast-hanja-encode",
+     "fast-kanji-encode",
+     "fast-gb-hanzi-encode",
+     "fast-big5-hanzi-encode",
+ ]
+ less-slow-big5-hanzi-encode = []
+ less-slow-gb-hanzi-encode = []
+ less-slow-kanji-encode = []
+-simd-accel = [
+-    "packed_simd",
+-    "packed_simd/into_bits",
+-]
+-
+-[badges.travis-ci]
+-repository = "hsivonen/encoding_rs"
++simd-accel = ["any_all_workaround"]
+diff --git a/third_party/rust/encoding_rs/README.md b/third_party/rust/encoding_rs/README.md
+--- third_party/rust/encoding_rs/README.md
++++ third_party/rust/encoding_rs/README.md
+@@ -162,50 +162,36 @@ wrappers.
+ * [C++](https://github.com/hsivonen/recode_cpp)
+ 
+ ## Optional features
+ 
+ There are currently these optional cargo features:
+ 
+ ### `simd-accel`
+ 
+-Enables SIMD acceleration using the nightly-dependent `packed_simd` crate.
++Enables SIMD acceleration using the nightly-dependent `portable_simd` standard
++library feature.
+ 
+ This is an opt-in feature, because enabling this feature _opts out_ of Rust's
+ guarantees of future compilers compiling old code (aka. "stability story").
+ 
+ Currently, this has not been tested to be an improvement except for these
+-targets:
++targets, and enabling the `simd-accel` feature is expected to break the build
++on other targets:
+ 
+ * x86_64
+ * i686
+ * aarch64
+ * thumbv7neon
+ 
+ If you use nightly Rust, you use targets whose first component is one of the
+ above, and you are prepared _to have to revise your configuration when updating
+ Rust_, you should enable this feature. Otherwise, please _do not_ enable this
+ feature.
+ 
+-_Note!_ If you are compiling for a target that does not have 128-bit SIMD
+-enabled as part of the target definition and you are enabling 128-bit SIMD
+-using `-C target_feature`, you need to enable the `core_arch` Cargo feature
+-for `packed_simd` to compile a crates.io snapshot of `core_arch` instead of
+-using the standard-library copy of `core::arch`, because the `core::arch`
+-module of the pre-compiled standard library has been compiled with the
+-assumption that the CPU doesn't have 128-bit SIMD. At present this applies
+-mainly to 32-bit ARM targets whose first component does not include the
+-substring `neon`.
+-
+-The encoding_rs side of things has not been properly set up for POWER,
+-PowerPC, MIPS, etc., SIMD at this time, so even if you were to follow
+-the advice from the previous paragraph, you probably shouldn't use
+-the `simd-accel` option on the less mainstream architectures at this
+-time.
+-
+ Used by Firefox.
+ 
+ ### `serde`
+ 
+ Enables support for serializing and deserializing `&'static Encoding`-typed
+ struct fields using [Serde][1].
+ 
+ [1]: https://serde.rs/
+@@ -376,18 +362,19 @@ It is a goal to support the latest stabl
+ the version of Rust that's used for Firefox Nightly.
+ 
+ At this time, there is no firm commitment to support a version older than
+ what's required by Firefox, and there is no commitment to treat MSRV changes
+ as semver-breaking, because this crate depends on `cfg-if`, which doesn't
+ appear to treat MSRV changes as semver-breaking, so it would be useless for
+ this crate to treat MSRV changes as semver-breaking.
+ 
+-As of 2021-02-04, MSRV appears to be Rust 1.36.0 for using the crate and
++As of 2024-04-04, MSRV appears to be Rust 1.36.0 for using the crate and
+ 1.42.0 for doc tests to pass without errors about the global allocator.
++With the `simd-accel` feature, the MSRV is even higher.
+ 
+ ## Compatibility with rust-encoding
+ 
+ A compatibility layer that implements the rust-encoding API on top of
+ encoding_rs is
+ [provided as a separate crate](https://github.com/hsivonen/encoding_rs_compat)
+ (cannot be uploaded to crates.io). The compatibility layer was originally
+ written with the assuption that Firefox would need it, but it is not currently
+@@ -441,20 +428,27 @@ To regenerate the generated code:
+ - [x] Implement the rust-encoding API in terms of encoding_rs.
+ - [x] Add SIMD acceleration for Aarch64.
+ - [x] Investigate the use of NEON on 32-bit ARM.
+ - [ ] ~Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as
+       adapted to Rust in rust-encoding.~
+ - [x] Add actually fast CJK encode options.
+ - [ ] ~Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).~
+ - [x] Provide a build mode that works without `alloc` (with lesser API surface).
+-- [ ] Migrate to `std::simd` once it is stable and declare 1.0.
++- [x] Migrate to `std::simd` ~once it is stable and declare 1.0.~
++- [ ] Migrate `unsafe` slice access by larger types than `u8`/`u16` to `align_to`.
+ 
+ ## Release Notes
+ 
++### 0.8.34
++
++* Use the `portable_simd` nightly feature of the standard library instead of the `packed_simd` crate. Only affects the `simd-accel` optional nightly feature.
++* Internal documentation improvements and minor code improvements around `unsafe`.
++* Added `rust-version` to `Cargo.toml`.
++
+ ### 0.8.33
+ 
+ * Use `packed_simd` instead of `packed_simd_2` again now that updates are back under the `packed_simd` name. Only affects the `simd-accel` optional nightly feature.
+ 
+ ### 0.8.32
+ 
+ * Removed `build.rs`. (This removal should resolve false positives reported by some antivirus products. This may break some build configurations that have opted out of Rust's guarantees against future build breakage.)
+ * Internal change to what API is used for reinterpreting the lane configuration of SIMD vectors.
+diff --git a/third_party/rust/encoding_rs/src/ascii.rs b/third_party/rust/encoding_rs/src/ascii.rs
+--- third_party/rust/encoding_rs/src/ascii.rs
++++ third_party/rust/encoding_rs/src/ascii.rs
+@@ -46,71 +46,87 @@ cfg_if! {
+         #[allow(dead_code)]
+         #[inline(always)]
+         fn likely(b: bool) -> bool {
+             b
+         }
+     }
+ }
+ 
++// Safety invariants for masks: data & mask = 0 for valid ASCII or basic latin utf-16
++
+ // `as` truncates, so works on 32-bit, too.
+ #[allow(dead_code)]
+ pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;
+ 
+ // `as` truncates, so works on 32-bit, too.
+ #[allow(dead_code)]
+ pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
+ 
+ #[allow(unused_macros)]
+ macro_rules! ascii_naive {
+     ($name:ident, $src_unit:ty, $dst_unit:ty) => {
++        /// Safety: src and dst must each have `len` elements and be aligned
++        /// Safety-usable invariant: will return Some() when it fails
++        /// to convert. The first value will be a `$src_unit` that is > 127.
+         #[inline(always)]
+         pub unsafe fn $name(
+             src: *const $src_unit,
+             dst: *mut $dst_unit,
+             len: usize,
+         ) -> Option<($src_unit, usize)> {
+             // Yes, manually omitting the bound check here matters
+             // a lot for perf.
+             for i in 0..len {
++                // Safety: len invariant used here
+                 let code_unit = *(src.add(i));
++                // Safety: Upholds safety-usable invariant here
+                 if code_unit > 127 {
+                     return Some((code_unit, i));
+                 }
++                // Safety: len invariant used here
+                 *(dst.add(i)) = code_unit as $dst_unit;
+             }
+             return None;
+         }
+     };
+ }
+ 
+ #[allow(unused_macros)]
+ macro_rules! ascii_alu {
+     ($name:ident,
++     // safety invariant: src/dst MUST be u8
+      $src_unit:ty,
+      $dst_unit:ty,
++     // Safety invariant: stride_fn must consume and produce two usizes, and return the index of the first non-ascii when it fails
+      $stride_fn:ident) => {
++        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
++        /// write
++        /// Safety-usable invariant: will return Some() when it fails
++        /// to convert. The first value will be a u8 that is > 127.
+         #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
+         #[inline(always)]
+         pub unsafe fn $name(
+             src: *const $src_unit,
+             dst: *mut $dst_unit,
+             len: usize,
+         ) -> Option<($src_unit, usize)> {
+             let mut offset = 0usize;
+             // This loop is only broken out of as a `goto` forward
+             loop {
++                // Safety: until_alignment becomes the number of bytes we need to munch until we are aligned to usize
+                 let mut until_alignment = {
+                     // Check if the other unit aligns if we move the narrower unit
+                     // to alignment.
+                     //               if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
+                     // ascii_to_ascii
+                     let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
+                     let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
+                     if src_alignment != dst_alignment {
++                        // Safety: bails early and ends up in the naïve branch where usize-alignment doesn't matter
+                         break;
+                     }
+                     (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
+                     //               } else if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
+                     // ascii_to_basic_latin
+                     //                   let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
+                     //                   if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
+                     //                       break;
+@@ -129,74 +145,104 @@ macro_rules! ascii_alu {
+                     // Moving pointers to alignment seems to be a pessimization on
+                     // x86_64 for operations that have UTF-16 as the internal
+                     // Unicode representation. However, since it seems to be a win
+                     // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
+                     // mixed results when encoding from UTF-16 and since x86 and
+                     // x86_64 should be using SSE2 in due course, keeping the move
+                     // to alignment here. It would be good to test on more ARM CPUs
+                     // and on real MIPS and POWER hardware.
++                    //
++                    // Safety: This is the naïve code once again, for `until_alignment` bytes
+                     while until_alignment != 0 {
+                         let code_unit = *(src.add(offset));
+                         if code_unit > 127 {
++                            // Safety: Upholds safety-usable invariant here
+                             return Some((code_unit, offset));
+                         }
+                         *(dst.add(offset)) = code_unit as $dst_unit;
++                        // Safety: offset is the number of bytes copied so far
+                         offset += 1;
+                         until_alignment -= 1;
+                     }
+                     let len_minus_stride = len - ALU_STRIDE_SIZE;
+                     loop {
++                        // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant
+                         if let Some(num_ascii) = $stride_fn(
++                            // Safety: These are known to be valid and aligned since we have at
++                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
++                            // number of elements copied so far, which according to the
++                            // until_alignment calculation above will cause both src and dst to be
++                            // aligned to usize after this add
+                             src.add(offset) as *const usize,
+                             dst.add(offset) as *mut usize,
+                         ) {
+                             offset += num_ascii;
++                            // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte
+                             return Some((*(src.add(offset)), offset));
+                         }
++                        // Safety: offset continues to be the number of bytes copied so far, and
++                        // maintains usize alignment for the next loop iteration
+                         offset += ALU_STRIDE_SIZE;
++                        // Safety: This is `offset > len - stride`. This loop will continue as long as
++                        // `offset <= len - stride`, which means there are `stride` bytes to still be read.
+                         if offset > len_minus_stride {
+                             break;
+                         }
+                     }
+                 }
+                 break;
+             }
++
++            // Safety: This is the naïve code, same as ascii_naive, and has no requirements
++            // other than src/dst being valid for the right lens
+             while offset < len {
++                // Safety: len invariant used here
+                 let code_unit = *(src.add(offset));
+                 if code_unit > 127 {
++                    // Safety: Upholds safety-usable invariant here
+                     return Some((code_unit, offset));
+                 }
++                // Safety: len invariant used here
+                 *(dst.add(offset)) = code_unit as $dst_unit;
+                 offset += 1;
+             }
+             None
+         }
+     };
+ }
+ 
+ #[allow(unused_macros)]
+ macro_rules! basic_latin_alu {
+     ($name:ident,
++    // safety invariant: use u8 for src/dest for ascii, and u16 for basic_latin
+      $src_unit:ty,
+      $dst_unit:ty,
++    // safety invariant: stride function must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and
++    // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst
+      $stride_fn:ident) => {
++        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
++        /// write
++        /// Safety-usable invariant: will return Some() when it fails
++        /// to convert. The first value will be a `$src_unit` that is > 127.
+         #[cfg_attr(
+             feature = "cargo-clippy",
+             allow(never_loop, cast_ptr_alignment, cast_lossless)
+         )]
+         #[inline(always)]
+         pub unsafe fn $name(
+             src: *const $src_unit,
+             dst: *mut $dst_unit,
+             len: usize,
+         ) -> Option<($src_unit, usize)> {
+             let mut offset = 0usize;
+             // This loop is only broken out of as a `goto` forward
+             loop {
++                // Safety: until_alignment becomes the number of bytes we need to munch from src/dest until we are aligned to usize
++                // We ensure basic-latin has the same alignment as ascii, starting with ascii since it is smaller.
+                 let mut until_alignment = {
+                     // Check if the other unit aligns if we move the narrower unit
+                     // to alignment.
+                     //               if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() {
+                     // ascii_to_ascii
+                     //                   let src_alignment = (src as usize) & ALIGNMENT_MASK;
+                     //                   let dst_alignment = (dst as usize) & ALIGNMENT_MASK;
+                     //                   if src_alignment != dst_alignment {
+@@ -232,66 +278,89 @@ macro_rules! basic_latin_alu {
+                     // Moving pointers to alignment seems to be a pessimization on
+                     // x86_64 for operations that have UTF-16 as the internal
+                     // Unicode representation. However, since it seems to be a win
+                     // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
+                     // mixed results when encoding from UTF-16 and since x86 and
+                     // x86_64 should be using SSE2 in due course, keeping the move
+                     // to alignment here. It would be good to test on more ARM CPUs
+                     // and on real MIPS and POWER hardware.
++                    //
++                    // Safety: This is the naïve code once again, for `until_alignment` bytes
+                     while until_alignment != 0 {
+                         let code_unit = *(src.add(offset));
+                         if code_unit > 127 {
++                            // Safety: Upholds safety-usable invariant here
+                             return Some((code_unit, offset));
+                         }
+                         *(dst.add(offset)) = code_unit as $dst_unit;
++                        // Safety: offset is the number of bytes copied so far
+                         offset += 1;
+                         until_alignment -= 1;
+                     }
+                     let len_minus_stride = len - ALU_STRIDE_SIZE;
+                     loop {
+                         if !$stride_fn(
++                            // Safety: These are known to be valid and aligned since we have at
++                            // least ALU_STRIDE_SIZE data in these buffers, and offset is the
++                            // number of elements copied so far, which according to the
++                            // until_alignment calculation above will cause both src and dst to be
++                            // aligned to usize after this add
+                             src.add(offset) as *const usize,
+                             dst.add(offset) as *mut usize,
+                         ) {
+                             break;
+                         }
++                        // Safety: offset continues to be the number of bytes copied so far, and
++                        // maintains usize alignment for the next loop iteration
+                         offset += ALU_STRIDE_SIZE;
++                        // Safety: This is `offset > len - stride`. This loop will continue as long as
++                        // `offset <= len - stride`, which means there are `stride` bytes to still be read.
+                         if offset > len_minus_stride {
+                             break;
+                         }
+                     }
+                 }
+                 break;
+             }
++            // Safety: This is the naïve code once again, for leftover bytes
+             while offset < len {
++                // Safety: len invariant used here
+                 let code_unit = *(src.add(offset));
+                 if code_unit > 127 {
++                    // Safety: Upholds safety-usable invariant here
+                     return Some((code_unit, offset));
+                 }
++                // Safety: len invariant used here
+                 *(dst.add(offset)) = code_unit as $dst_unit;
+                 offset += 1;
+             }
+             None
+         }
+     };
+ }
+ 
+ #[allow(unused_macros)]
+ macro_rules! latin1_alu {
++    // safety invariant: stride function must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and
++    // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst
+     ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
++        /// Safety: src and dst must have len elements, src is valid for read, dst is valid for
++        /// write
+         #[cfg_attr(
+             feature = "cargo-clippy",
+             allow(never_loop, cast_ptr_alignment, cast_lossless)
+         )]
+         #[inline(always)]
+         pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
+             let mut offset = 0usize;
+             // This loop is only broken out of as a `goto` forward
+             loop {
++                // Safety: until_alignment becomes the number of bytes we need to munch from src/dest until we are aligned to usize
++                // We ensure the UTF-16 side has the same alignment as the Latin-1 side, starting with Latin-1 since it is smaller.
+                 let mut until_alignment = {
+                     if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() {
+                         // unpack
+                         let src_until_alignment = (ALU_ALIGNMENT
+                             - ((src as usize) & ALU_ALIGNMENT_MASK))
+                             & ALU_ALIGNMENT_MASK;
+                         if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
+                             != 0
+@@ -308,373 +377,485 @@ macro_rules! latin1_alu {
+                             != 0
+                         {
+                             break;
+                         }
+                         dst_until_alignment
+                     }
+                 };
+                 if until_alignment + ALU_STRIDE_SIZE <= len {
++                    // Safety: This is the naïve code once again, for `until_alignment` bytes
+                     while until_alignment != 0 {
+                         let code_unit = *(src.add(offset));
+                         *(dst.add(offset)) = code_unit as $dst_unit;
++                        // Safety: offset is the number of bytes copied so far
+                         offset += 1;
*** 2691 LINES SKIPPED ***