y_all_workaround/README.md +@@ -0,0 +1,13 @@ ++# any_all_workaround ++ ++This is a workaround for bad codegen ([Rust bug](https://github.com/rust-lang/portable-simd/issues/146), [LLVM bug](https://github.com/llvm/llvm-project/issues/50466)) for the `any()` and `all()` reductions for NEON-backed SIMD vectors on 32-bit ARM. On other platforms these delegate to `any()` and `all()` in `core::simd`. ++ ++The plan is to abandon this crate once the LLVM bug is fixed or `core::simd` works around the LLVM bug. ++ ++The code is forked from the [`packed_simd` crate](https://raw.githubusercontent.com/hsivonen/packed_simd/d938e39bee9bc5c222f5f2f2a0df9e53b5ce36ae/src/codegen/reductions/mask/arm.rs). ++ ++This crate requires Nightly Rust as it depends on the `portable_simd` feature. ++ ++# License ++ ++`MIT OR Apache-2.0`, since that's how `packed_simd` is licensed. (The ARM intrinsics Rust version workaround is from qcms, see LICENSE-MIT-QCMS.) +diff --git a/third_party/rust/any_all_workaround/build.rs b/third_party/rust/any_all_workaround/build.rs +new file mode 100644 +--- /dev/null ++++ third_party/rust/any_all_workaround/build.rs +@@ -0,0 +1,7 @@ ++extern crate version_check as rustc; ++ ++fn main() { ++ if rustc::is_min_version("1.78.0").unwrap_or(false) { ++ println!("cargo:rustc-cfg=stdsimd_split"); ++ } ++} +diff --git a/third_party/rust/any_all_workaround/src/lib.rs b/third_party/rust/any_all_workaround/src/lib.rs +new file mode 100644 +--- /dev/null ++++ third_party/rust/any_all_workaround/src/lib.rs +@@ -0,0 +1,110 @@ ++// This code began as a fork of ++// https://raw.githubusercontent.com/rust-lang/packed_simd/d938e39bee9bc5c222f5f2f2a0df9e53b5ce36ae/src/codegen/reductions/mask/arm.rs ++// which didn't have a license header on the file, but Cargo.toml said "MIT OR Apache-2.0". ++// See LICENSE-MIT and LICENSE-APACHE. ++ ++#![no_std] ++#![feature(portable_simd)] ++#![cfg_attr( ++ all( ++ stdsimd_split, ++ target_arch = "arm", ++ target_endian = "little", ++ target_feature = "neon", ++ target_feature = "v7" ++ ), ++ feature(stdarch_arm_neon_intrinsics) ++)] ++#![cfg_attr( ++ all( ++ not(stdsimd_split), ++ target_arch = "arm", ++ target_endian = "little", ++ target_feature = "neon", ++ target_feature = "v7" ++ ), ++ feature(stdsimd) ++)] ++ ++use cfg_if::cfg_if; ++use core::simd::mask16x8; ++use core::simd::mask32x4; ++use core::simd::mask8x16; ++ ++cfg_if! { ++ if #[cfg(all(target_arch = "arm", target_endian = "little", target_feature = "neon", target_feature = "v7"))] { ++ use core::simd::mask8x8; ++ use core::simd::mask16x4; ++ use core::simd::mask32x2; ++ macro_rules! arm_128_v7_neon_impl { ++ ($all:ident, $any:ident, $id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { ++ #[inline] ++ pub fn $all(s: $id) -> bool { ++ use core::arch::arm::$vpmin; ++ use core::mem::transmute; ++ unsafe { ++ union U { ++ halves: ($half, $half), ++ vec: $id, ++ } ++ let halves = U { vec: s }.halves; ++ let h: $half = transmute($vpmin(transmute(halves.0), transmute(halves.1))); ++ h.all() ++ } ++ } ++ #[inline] ++ pub fn $any(s: $id) -> bool { ++ use core::arch::arm::$vpmax; ++ use core::mem::transmute; ++ unsafe { ++ union U { ++ halves: ($half, $half), ++ vec: $id, ++ } ++ let halves = U { vec: s }.halves; ++ let h: $half = transmute($vpmax(transmute(halves.0), transmute(halves.1))); ++ h.any() ++ } ++ } ++ } ++ } ++ } else { ++ macro_rules! arm_128_v7_neon_impl { ++ ($all:ident, $any:ident, $id:ident, $half:ident, $vpmin:ident, $vpmax:ident) => { ++ #[inline(always)] ++ pub fn $all(s: $id) -> bool { ++ s.all() ++ } ++ #[inline(always)] ++ pub fn $any(s: $id) -> bool { ++ s.any() ++ } ++ } ++ } ++ } ++} ++ ++arm_128_v7_neon_impl!( ++ all_mask8x16, ++ any_mask8x16, ++ mask8x16, ++ mask8x8, ++ vpmin_u8, ++ vpmax_u8 ++); ++arm_128_v7_neon_impl!( ++ all_mask16x8, ++ any_mask16x8, ++ mask16x8, ++ mask16x4, ++ vpmin_u16, ++ vpmax_u16 ++); ++arm_128_v7_neon_impl!( ++ all_mask32x4, ++ any_mask32x4, ++ mask32x4, ++ mask32x2, ++ vpmin_u32, ++ vpmax_u32 ++); +diff --git a/third_party/rust/encoding_rs/Cargo.toml b/third_party/rust/encoding_rs/Cargo.toml +--- third_party/rust/encoding_rs/Cargo.toml ++++ third_party/rust/encoding_rs/Cargo.toml +@@ -6,18 +6,19 @@ + # to registry (e.g., crates.io) dependencies. + # + # If you are reading this file be aware that the original Cargo.toml + # will likely look very different (and much more reasonable). + # See Cargo.toml.orig for the original contents. + + [package] + edition = "2018" ++rust-version = "1.36" + name = "encoding_rs" +-version = "0.8.33" ++version = "0.8.34" + authors = ["Henri Sivonen "] + description = "A Gecko-oriented implementation of the Encoding Standard" + homepage = "https://docs.rs/encoding_rs/" + documentation = "https://docs.rs/encoding_rs/" + readme = "README.md" + keywords = [ + "encoding", + "web", +@@ -31,23 +32,23 @@ categories = [ + "internationalization", + ] + license = "(Apache-2.0 OR MIT) AND BSD-3-Clause" + repository = "https://github.com/hsivonen/encoding_rs" + + [profile.release] + lto = true + ++[dependencies.any_all_workaround] ++version = "0.1.0" ++optional = true ++ + [dependencies.cfg-if] + version = "1.0" + +-[dependencies.packed_simd] +-version = "0.3.9" +-optional = true +- + [dependencies.serde] + version = "1.0" + optional = true + + [dev-dependencies.bincode] + version = "1.0" + + [dev-dependencies.serde_derive] +@@ -69,15 +70,9 @@ fast-legacy-encode = [ + "fast-hanja-encode", + "fast-kanji-encode", + "fast-gb-hanzi-encode", + "fast-big5-hanzi-encode", + ] + less-slow-big5-hanzi-encode = [] + less-slow-gb-hanzi-encode = [] + less-slow-kanji-encode = [] +-simd-accel = [ +- "packed_simd", +- "packed_simd/into_bits", +-] +- +-[badges.travis-ci] +-repository = "hsivonen/encoding_rs" ++simd-accel = ["any_all_workaround"] +diff --git a/third_party/rust/encoding_rs/README.md b/third_party/rust/encoding_rs/README.md +--- third_party/rust/encoding_rs/README.md ++++ third_party/rust/encoding_rs/README.md +@@ -162,50 +162,36 @@ wrappers. + * [C++](https://github.com/hsivonen/recode_cpp) + + ## Optional features + + There are currently these optional cargo features: + + ### `simd-accel` + +-Enables SIMD acceleration using the nightly-dependent `packed_simd` crate. ++Enables SIMD acceleration using the nightly-dependent `portable_simd` standard ++library feature. + + This is an opt-in feature, because enabling this feature _opts out_ of Rust's + guarantees of future compilers compiling old code (aka. "stability story"). + + Currently, this has not been tested to be an improvement except for these +-targets: ++targets and enabling the `simd-accel` feature is expected to break the build ++on other targets: + + * x86_64 + * i686 + * aarch64 + * thumbv7neon + + If you use nightly Rust, you use targets whose first component is one of the + above, and you are prepared _to have to revise your configuration when updating + Rust_, you should enable this feature. Otherwise, please _do not_ enable this + feature. + +-_Note!_ If you are compiling for a target that does not have 128-bit SIMD +-enabled as part of the target definition and you are enabling 128-bit SIMD +-using `-C target_feature`, you need to enable the `core_arch` Cargo feature +-for `packed_simd` to compile a crates.io snapshot of `core_arch` instead of +-using the standard-library copy of `core::arch`, because the `core::arch` +-module of the pre-compiled standard library has been compiled with the +-assumption that the CPU doesn't have 128-bit SIMD. At present this applies +-mainly to 32-bit ARM targets whose first component does not include the +-substring `neon`. +- +-The encoding_rs side of things has not been properly set up for POWER, +-PowerPC, MIPS, etc., SIMD at this time, so even if you were to follow +-the advice from the previous paragraph, you probably shouldn't use +-the `simd-accel` option on the less mainstream architectures at this +-time. +- + Used by Firefox. + + ### `serde` + + Enables support for serializing and deserializing `&'static Encoding`-typed + struct fields using [Serde][1]. + + [1]: https://serde.rs/ +@@ -376,18 +362,19 @@ It is a goal to support the latest stabl + the version of Rust that's used for Firefox Nightly. + + At this time, there is no firm commitment to support a version older than + what's required by Firefox, and there is no commitment to treat MSRV changes + as semver-breaking, because this crate depends on `cfg-if`, which doesn't + appear to treat MSRV changes as semver-breaking, so it would be useless for + this crate to treat MSRV changes as semver-breaking. + +-As of 2021-02-04, MSRV appears to be Rust 1.36.0 for using the crate and ++As of 2024-04-04, MSRV appears to be Rust 1.36.0 for using the crate and + 1.42.0 for doc tests to pass without errors about the global allocator. ++With the `simd-accel` feature, the MSRV is even higher. + + ## Compatibility with rust-encoding + + A compatibility layer that implements the rust-encoding API on top of + encoding_rs is + [provided as a separate crate](https://github.com/hsivonen/encoding_rs_compat) + (cannot be uploaded to crates.io). The compatibility layer was originally + written with the assuption that Firefox would need it, but it is not currently +@@ -441,20 +428,27 @@ To regenerate the generated code: + - [x] Implement the rust-encoding API in terms of encoding_rs. + - [x] Add SIMD acceleration for Aarch64. + - [x] Investigate the use of NEON on 32-bit ARM. + - [ ] ~Investigate Björn Höhrmann's lookup table acceleration for UTF-8 as + adapted to Rust in rust-encoding.~ + - [x] Add actually fast CJK encode options. + - [ ] ~Investigate [Bob Steagall's lookup table acceleration for UTF-8](https://github.com/BobSteagall/CppNow2018/blob/master/FastConversionFromUTF-8/Fast%20Conversion%20From%20UTF-8%20with%20C%2B%2B%2C%20DFAs%2C%20and%20SSE%20Intrinsics%20-%20Bob%20Steagall%20-%20C%2B%2BNow%202018.pdf).~ + - [x] Provide a build mode that works without `alloc` (with lesser API surface). +-- [ ] Migrate to `std::simd` once it is stable and declare 1.0. ++- [x] Migrate to `std::simd` ~once it is stable and declare 1.0.~ ++- [ ] Migrate `unsafe` slice access by larger types than `u8`/`u16` to `align_to`. + + ## Release Notes + ++### 0.8.34 ++ ++* Use the `portable_simd` nightly feature of the standard library instead of the `packed_simd` crate. Only affects the `simd-accel` optional nightly feature. ++* Internal documentation improvements and minor code improvements around `unsafe`. ++* Added `rust-version` to `Cargo.toml`. ++ + ### 0.8.33 + + * Use `packed_simd` instead of `packed_simd_2` again now that updates are back under the `packed_simd` name. Only affects the `simd-accel` optional nightly feature. + + ### 0.8.32 + + * Removed `build.rs`. (This removal should resolve false positives reported by some antivirus products. This may break some build configurations that have opted out of Rust's guarantees against future build breakage.) + * Internal change to what API is used for reinterpreting the lane configuration of SIMD vectors. +diff --git a/third_party/rust/encoding_rs/src/ascii.rs b/third_party/rust/encoding_rs/src/ascii.rs +--- third_party/rust/encoding_rs/src/ascii.rs ++++ third_party/rust/encoding_rs/src/ascii.rs +@@ -46,71 +46,87 @@ cfg_if! { + #[allow(dead_code)] + #[inline(always)] + fn likely(b: bool) -> bool { + b + } + } + } + ++// Safety invariants for masks: data & mask = 0 for valid ASCII or basic latin utf-16 ++ + // `as` truncates, so works on 32-bit, too. + #[allow(dead_code)] + pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize; + + // `as` truncates, so works on 32-bit, too. + #[allow(dead_code)] + pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize; + + #[allow(unused_macros)] + macro_rules! ascii_naive { + ($name:ident, $src_unit:ty, $dst_unit:ty) => { ++ /// Safety: src and dst must have len_unit elements and be aligned ++ /// Safety-usable invariant: will return Some() when it fails ++ /// to convert. The first value will be a u8 that is > 127. + #[inline(always)] + pub unsafe fn $name( + src: *const $src_unit, + dst: *mut $dst_unit, + len: usize, + ) -> Option<($src_unit, usize)> { + // Yes, manually omitting the bound check here matters + // a lot for perf. + for i in 0..len { ++ // Safety: len invariant used here + let code_unit = *(src.add(i)); ++ // Safety: Upholds safety-usable invariant here + if code_unit > 127 { + return Some((code_unit, i)); + } ++ // Safety: len invariant used here + *(dst.add(i)) = code_unit as $dst_unit; + } + return None; + } + }; + } + + #[allow(unused_macros)] + macro_rules! ascii_alu { + ($name:ident, ++ // safety invariant: src/dst MUST be u8 + $src_unit:ty, + $dst_unit:ty, ++ // Safety invariant: stride_fn must consume and produce two usizes, and return the index of the first non-ascii when it fails + $stride_fn:ident) => { ++ /// Safety: src and dst must have len elements, src is valid for read, dst is valid for ++ /// write ++ /// Safety-usable invariant: will return Some() when it fails ++ /// to convert. The first value will be a u8 that is > 127. + #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))] + #[inline(always)] + pub unsafe fn $name( + src: *const $src_unit, + dst: *mut $dst_unit, + len: usize, + ) -> Option<($src_unit, usize)> { + let mut offset = 0usize; + // This loop is only broken out of as a `goto` forward + loop { ++ // Safety: until_alignment becomes the number of bytes we need to munch until we are aligned to usize + let mut until_alignment = { + // Check if the other unit aligns if we move the narrower unit + // to alignment. + // if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() { + // ascii_to_ascii + let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK; + let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK; + if src_alignment != dst_alignment { ++ // Safety: bails early and ends up in the naïve branch where usize-alignment doesn't matter + break; + } + (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK + // } else if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() { + // ascii_to_basic_latin + // let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK; + // if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 { + // break; +@@ -129,74 +145,104 @@ macro_rules! ascii_alu { + // Moving pointers to alignment seems to be a pessimization on + // x86_64 for operations that have UTF-16 as the internal + // Unicode representation. However, since it seems to be a win + // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except + // mixed results when encoding from UTF-16 and since x86 and + // x86_64 should be using SSE2 in due course, keeping the move + // to alignment here. It would be good to test on more ARM CPUs + // and on real MIPS and POWER hardware. ++ // ++ // Safety: This is the naïve code once again, for `until_alignment` bytes + while until_alignment != 0 { + let code_unit = *(src.add(offset)); + if code_unit > 127 { ++ // Safety: Upholds safety-usable invariant here + return Some((code_unit, offset)); + } + *(dst.add(offset)) = code_unit as $dst_unit; ++ // Safety: offset is the number of bytes copied so far + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - ALU_STRIDE_SIZE; + loop { ++ // Safety: num_ascii is known to be a byte index of a non-ascii byte due to stride_fn's invariant + if let Some(num_ascii) = $stride_fn( ++ // Safety: These are known to be valid and aligned since we have at ++ // least ALU_STRIDE_SIZE data in these buffers, and offset is the ++ // number of elements copied so far, which according to the ++ // until_alignment calculation above will cause both src and dst to be ++ // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ) { + offset += num_ascii; ++ // Safety: Upholds safety-usable invariant here by indexing into non-ascii byte + return Some((*(src.add(offset)), offset)); + } ++ // Safety: offset continues to be the number of bytes copied so far, and ++ // maintains usize alignment for the next loop iteration + offset += ALU_STRIDE_SIZE; ++ // Safety: This is `offset > len - stride. This loop will continue as long as ++ // `offset <= len - stride`, which means there are `stride` bytes to still be read. + if offset > len_minus_stride { + break; + } + } + } + break; + } ++ ++ // Safety: This is the naïve code, same as ascii_naive, and has no requirements ++ // other than src/dst being valid for the the right lens + while offset < len { ++ // Safety: len invariant used here + let code_unit = *(src.add(offset)); + if code_unit > 127 { ++ // Safety: Upholds safety-usable invariant here + return Some((code_unit, offset)); + } ++ // Safety: len invariant used here + *(dst.add(offset)) = code_unit as $dst_unit; + offset += 1; + } + None + } + }; + } + + #[allow(unused_macros)] + macro_rules! basic_latin_alu { + ($name:ident, ++ // safety invariant: use u8 for src/dest for ascii, and u16 for basic_latin + $src_unit:ty, + $dst_unit:ty, ++ // safety invariant: stride function must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and ++ // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst + $stride_fn:ident) => { ++ /// Safety: src and dst must have len elements, src is valid for read, dst is valid for ++ /// write ++ /// Safety-usable invariant: will return Some() when it fails ++ /// to convert. The first value will be a u8 that is > 127. + #[cfg_attr( + feature = "cargo-clippy", + allow(never_loop, cast_ptr_alignment, cast_lossless) + )] + #[inline(always)] + pub unsafe fn $name( + src: *const $src_unit, + dst: *mut $dst_unit, + len: usize, + ) -> Option<($src_unit, usize)> { + let mut offset = 0usize; + // This loop is only broken out of as a `goto` forward + loop { ++ // Safety: until_alignment becomes the number of bytes we need to munch from src/dest until we are aligned to usize ++ // We ensure basic-latin has the same alignment as ascii, starting with ascii since it is smaller. + let mut until_alignment = { + // Check if the other unit aligns if we move the narrower unit + // to alignment. + // if ::core::mem::size_of::<$src_unit>() == ::core::mem::size_of::<$dst_unit>() { + // ascii_to_ascii + // let src_alignment = (src as usize) & ALIGNMENT_MASK; + // let dst_alignment = (dst as usize) & ALIGNMENT_MASK; + // if src_alignment != dst_alignment { +@@ -232,66 +278,89 @@ macro_rules! basic_latin_alu { + // Moving pointers to alignment seems to be a pessimization on + // x86_64 for operations that have UTF-16 as the internal + // Unicode representation. However, since it seems to be a win + // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except + // mixed results when encoding from UTF-16 and since x86 and + // x86_64 should be using SSE2 in due course, keeping the move + // to alignment here. It would be good to test on more ARM CPUs + // and on real MIPS and POWER hardware. ++ // ++ // Safety: This is the naïve code once again, for `until_alignment` bytes + while until_alignment != 0 { + let code_unit = *(src.add(offset)); + if code_unit > 127 { ++ // Safety: Upholds safety-usable invariant here + return Some((code_unit, offset)); + } + *(dst.add(offset)) = code_unit as $dst_unit; ++ // Safety: offset is the number of bytes copied so far + offset += 1; + until_alignment -= 1; + } + let len_minus_stride = len - ALU_STRIDE_SIZE; + loop { + if !$stride_fn( ++ // Safety: These are known to be valid and aligned since we have at ++ // least ALU_STRIDE_SIZE data in these buffers, and offset is the ++ // number of elements copied so far, which according to the ++ // until_alignment calculation above will cause both src and dst to be ++ // aligned to usize after this add + src.add(offset) as *const usize, + dst.add(offset) as *mut usize, + ) { + break; + } ++ // Safety: offset continues to be the number of bytes copied so far, and ++ // maintains usize alignment for the next loop iteration + offset += ALU_STRIDE_SIZE; ++ // Safety: This is `offset > len - stride. This loop will continue as long as ++ // `offset <= len - stride`, which means there are `stride` bytes to still be read. + if offset > len_minus_stride { + break; + } + } + } + break; + } ++ // Safety: This is the naïve code once again, for leftover bytes + while offset < len { ++ // Safety: len invariant used here + let code_unit = *(src.add(offset)); + if code_unit > 127 { ++ // Safety: Upholds safety-usable invariant here + return Some((code_unit, offset)); + } ++ // Safety: len invariant used here + *(dst.add(offset)) = code_unit as $dst_unit; + offset += 1; + } + None + } + }; + } + + #[allow(unused_macros)] + macro_rules! latin1_alu { ++ // safety invariant: stride function must munch ALU_STRIDE_SIZE*size(src_unit) bytes off of src and ++ // write ALU_STRIDE_SIZE*size(dst_unit) bytes to dst + ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => { ++ /// Safety: src and dst must have len elements, src is valid for read, dst is valid for ++ /// write + #[cfg_attr( + feature = "cargo-clippy", + allow(never_loop, cast_ptr_alignment, cast_lossless) + )] + #[inline(always)] + pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) { + let mut offset = 0usize; + // This loop is only broken out of as a `goto` forward + loop { ++ // Safety: until_alignment becomes the number of bytes we need to munch from src/dest until we are aligned to usize ++ // We ensure the UTF-16 side has the same alignment as the Latin-1 side, starting with Latin-1 since it is smaller. + let mut until_alignment = { + if ::core::mem::size_of::<$src_unit>() < ::core::mem::size_of::<$dst_unit>() { + // unpack + let src_until_alignment = (ALU_ALIGNMENT + - ((src as usize) & ALU_ALIGNMENT_MASK)) + & ALU_ALIGNMENT_MASK; + if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK + != 0 +@@ -308,373 +377,485 @@ macro_rules! latin1_alu { + != 0 + { + break; + } + dst_until_alignment + } + }; + if until_alignment + ALU_STRIDE_SIZE <= len { ++ // Safety: This is the naïve code once again, for `until_alignment` bytes + while until_alignment != 0 { + let code_unit = *(src.add(offset)); + *(dst.add(offset)) = code_unit as $dst_unit; ++ // Safety: offset is the number of bytes copied so far + offset += 1; *** 2691 LINES SKIPPED ***