diff --git a/CMake/resolve_dependency_modules/xsimd.cmake b/CMake/resolve_dependency_modules/xsimd.cmake index 56f1bc2ae..62d2cb62a 100644 --- a/CMake/resolve_dependency_modules/xsimd.cmake +++ b/CMake/resolve_dependency_modules/xsimd.cmake @@ -13,9 +13,9 @@ # limitations under the License. include_guard(GLOBAL) -set(VELOX_XSIMD_VERSION 10.0.0) +set(VELOX_XSIMD_VERSION 13.1.0) set(VELOX_XSIMD_BUILD_SHA256_CHECKSUM - 73f818368b3a4dad92fab1b2933d93694241bd2365a6181747b2df1768f6afdd) + 88c9dc6da677feadb40fe09f467659ba0a98e9987f7491d51919ee13d897efa4) set(VELOX_XSIMD_SOURCE_URL "https://github.com/xtensor-stack/xsimd/archive/refs/tags/${VELOX_XSIMD_VERSION}.tar.gz" ) diff --git a/velox/common/base/SimdUtil.h b/velox/common/base/SimdUtil.h index edc7e06c2..024b6aa1d 100644 --- a/velox/common/base/SimdUtil.h +++ b/velox/common/base/SimdUtil.h @@ -25,6 +25,9 @@ namespace facebook::velox::simd { +template +using xbatch = xsimd::batch, uint8_t, T>, A>; + // Return width of the widest store. template constexpr int32_t batchByteSize(const A& = {}) { @@ -363,14 +366,9 @@ xsimd::batch iota(const A& = {}); // Returns a batch with all elements set to value. For batch we // use one bit to represent one element. template -xsimd::batch setAll(T value, const A& = {}) { +xbatch setAll(T value, const A& = {}) { if constexpr (std::is_same_v) { -#if defined(__aarch64__) - return xsimd::batch( - xsimd::broadcast(value ? -1 : 0)); -#else - return xsimd::batch(xsimd::broadcast(value ? -1 : 0)); -#endif + return xsimd::broadcast(value ? 0xFF : 0x00); } else { return xsimd::broadcast(value); } diff --git a/velox/vector/BiasVector-inl.h b/velox/vector/BiasVector-inl.h index 8265ac6ee..d276329e7 100644 --- a/velox/vector/BiasVector-inl.h +++ b/velox/vector/BiasVector-inl.h @@ -111,7 +111,7 @@ const T BiasVector::valueAtFast(vector_size_t idx) const { } template -xsimd::batch BiasVector::loadSIMDValueBufferAt(size_t index) const { +simd::xbatch BiasVector::loadSIMDValueBufferAt(size_t index) const { if constexpr (std::is_same_v) { switch (valueType_) { case TypeKind::INTEGER: diff --git a/velox/vector/BiasVector.h b/velox/vector/BiasVector.h index 2a2e43a6d..c71ee1d3e 100644 --- a/velox/vector/BiasVector.h +++ b/velox/vector/BiasVector.h @@ -130,7 +130,7 @@ class BiasVector : public SimpleVector { * * @param byteOffset - the byte offset to laod from */ - xsimd::batch loadSIMDValueBufferAt(size_t index) const; + simd::xbatch loadSIMDValueBufferAt(size_t index) const; std::unique_ptr> hashAll() const override; @@ -183,10 +183,10 @@ class BiasVector : public SimpleVector { private: template - inline xsimd::batch loadSIMDInternal(size_t byteOffset) const { + inline simd::xbatch loadSIMDInternal(size_t byteOffset) const { auto mem = reinterpret_cast( rawValues_ + byteOffset / sizeof(T) * sizeof(U)); - return xsimd::batch::load_unaligned(mem); + return simd::xbatch::load_unaligned(mem); } TypeKind valueType_; @@ -198,7 +198,7 @@ class BiasVector : public SimpleVector { T bias_; // Used to debias several values at a time. - std::conditional_t, char> biasBuffer_; + std::conditional_t, char> biasBuffer_; }; template diff --git a/velox/vector/ConstantVector.h b/velox/vector/ConstantVector.h index c299c40c2..36319b5af 100644 --- a/velox/vector/ConstantVector.h +++ b/velox/vector/ConstantVector.h @@ -190,7 +190,7 @@ class ConstantVector final : public SimpleVector { /// Loads a 256bit vector of data at the virtual byteOffset given /// Note this method is implemented on each vector type, but is intentionally /// not virtual for performance reasons - xsimd::batch loadSIMDValueBufferAt(size_t /* byteOffset */) const { + simd::xbatch loadSIMDValueBufferAt(size_t /* byteOffset */) const { VELOX_DCHECK(initialized_); return valueBuffer_; } @@ -477,7 +477,7 @@ class ConstantVector final : public SimpleVector { mutable std::atomic wrapInfo_{nullptr}; // This must be at end to avoid memory corruption. - std::conditional_t, char> valueBuffer_; + std::conditional_t, char> valueBuffer_; }; template <> diff --git a/velox/vector/DictionaryVector-inl.h b/velox/vector/DictionaryVector-inl.h index 98f12713f..da8bc8b6f 100644 --- a/velox/vector/DictionaryVector-inl.h +++ b/velox/vector/DictionaryVector-inl.h @@ -160,10 +160,10 @@ std::unique_ptr> DictionaryVector::hashAll() const { } template -xsimd::batch DictionaryVector::loadSIMDValueBufferAt( +simd::xbatch DictionaryVector::loadSIMDValueBufferAt( size_t byteOffset) const { if constexpr (can_simd) { - constexpr int N = xsimd::batch::size; + constexpr int N = simd::xbatch::size; alignas(xsimd::default_arch::alignment()) T tmp[N]; auto startIndex = byteOffset / sizeof(T); for (int i = 0; i < N; ++i) { diff --git a/velox/vector/DictionaryVector.h b/velox/vector/DictionaryVector.h index 2363b8b0d..66c810eb2 100644 --- a/velox/vector/DictionaryVector.h +++ b/velox/vector/DictionaryVector.h @@ -116,7 +116,7 @@ class DictionaryVector : public SimpleVector { * @param index at which to start the vector load * @return the vector of values starting at the given index */ - xsimd::batch loadSIMDValueBufferAt(size_t index) const; + simd::xbatch loadSIMDValueBufferAt(size_t index) const; inline const BufferPtr& indices() const { return indices_; diff --git a/velox/vector/FlatVector-inl.h b/velox/vector/FlatVector-inl.h index defc10355..e7c19b35e 100644 --- a/velox/vector/FlatVector-inl.h +++ b/velox/vector/FlatVector-inl.h @@ -68,10 +68,10 @@ Range FlatVector::asRange() const { } template -xsimd::batch FlatVector::loadSIMDValueBufferAt(size_t byteOffset) const { +simd::xbatch FlatVector::loadSIMDValueBufferAt(size_t byteOffset) const { auto mem = reinterpret_cast(rawValues_) + byteOffset; if constexpr (std::is_same_v) { - return xsimd::batch(xsimd::load_unaligned(mem)); + return xsimd::load_unaligned(mem); } else { return xsimd::load_unaligned(reinterpret_cast(mem)); } @@ -122,7 +122,7 @@ bool FlatVector::useSimdEquality(size_t numCmpVals) const { // whether or not to pursue the SIMD path or the fallback path. auto fallbackCost = SET_CMP_COST * BaseVector::length_; auto simdCost = SIMD_CMP_COST * numCmpVals * BaseVector::length_ / - xsimd::batch::size; + simd::xbatch::size; return simdCost <= fallbackCost; } } diff --git a/velox/vector/FlatVector.h b/velox/vector/FlatVector.h index 04c81640c..238eb21e8 100644 --- a/velox/vector/FlatVector.h +++ b/velox/vector/FlatVector.h @@ -124,7 +124,7 @@ class FlatVector final : public SimpleVector { /// Note this method is implemented on each vector type, but is intentionally /// not virtual for performance reasons. /// 'index' indicates the byte offset to load from - xsimd::batch loadSIMDValueBufferAt(size_t index) const; + simd::xbatch loadSIMDValueBufferAt(size_t index) const; /// dictionary vector makes internal usehere for SIMD functions template diff --git a/velox/vector/SequenceVector-inl.h b/velox/vector/SequenceVector-inl.h index 27e47271c..1f84a7d00 100644 --- a/velox/vector/SequenceVector-inl.h +++ b/velox/vector/SequenceVector-inl.h @@ -120,13 +120,13 @@ std::unique_ptr> SequenceVector::hashAll() const { } template -xsimd::batch SequenceVector::loadSIMDValueBufferAt( +simd::xbatch SequenceVector::loadSIMDValueBufferAt( size_t byteOffset) const { if constexpr (std::is_same_v) { throw std::runtime_error( "Sequence encoding only supports SIMD operations on integers"); } else { - constexpr int kBatchSize = xsimd::batch::size; + constexpr int kBatchSize = simd::xbatch::size; auto startIndex = byteOffset / sizeof(T); if (checkLoadRange(startIndex, kBatchSize)) { return simd::setAll(valueAtFast(startIndex)); diff --git a/velox/vector/SequenceVector.h b/velox/vector/SequenceVector.h index ab277ef78..07f3a9419 100644 --- a/velox/vector/SequenceVector.h +++ b/velox/vector/SequenceVector.h @@ -96,7 +96,7 @@ class SequenceVector : public SimpleVector { * * @param byteOffset - the byte offset to laod from */ - xsimd::batch loadSIMDValueBufferAt(size_t index) const; + simd::xbatch loadSIMDValueBufferAt(size_t index) const; /** * Returns a shared_ptr to the underlying byte buffer holding the values for diff --git a/velox/vector/tests/SimpleVectorTest.cpp b/velox/vector/tests/SimpleVectorTest.cpp index 9f2e30af9..f90d44643 100644 --- a/velox/vector/tests/SimpleVectorTest.cpp +++ b/velox/vector/tests/SimpleVectorTest.cpp @@ -747,14 +747,14 @@ TYPED_TEST(SimpleVectorCompareTest, compareDescNullsLast) { this->runTest({/*nullsFirst=*/false, /*ascending=*/false}); } -template -inline T simd256_extract_value(xsimd::batch simdValue) { +template +inline T simd256_extract_value(xsimd::batch simdValue) { if constexpr (std::is_same_v) { static_assert(offset < 256); auto byte = xsimd::batch(simdValue).get(offset / 8); return byte & (1 << (offset % 8)); } else if constexpr (std::is_integral_v) { - static_assert(offset < xsimd::batch::size); + static_assert(offset < xsimd::batch::size); return simdValue.get(offset); } else { VELOX_UNSUPPORTED( @@ -770,7 +770,7 @@ template struct AssertSimdElement { static void eq( const std::vector>& expected, - xsimd::batch simdBuffer, + simd::xbatch simdBuffer, size_t base) { static_assert(i >= 0); if (base + i < expected.size()) { @@ -806,7 +806,7 @@ struct CanSimd { }; template -xsimd::batch loadSIMDValueBufferAt( +simd::xbatch loadSIMDValueBufferAt( const SimpleVector* outVector, size_t byteOffset) { switch (outVector->encoding()) { @@ -886,16 +886,19 @@ class SimpleVectorSimdTypedTest : public SimpleVectorTest { 2008 /* seed */); auto vector = maker_.encodedVector(encode, expected.data()); - constexpr auto width = xsimd::batch::size; + + constexpr bool is_bool = std::is_same_v; + constexpr auto width = simd::xbatch::size; + // Though sizeof(bool) = 1, while a bool only occupies 1 bit in + // SIMD. + constexpr auto element_bitwidth = is_bool ? 1 : sizeof(T) * 8; // TODO T71293360: determine SIMD behavior when index + SIMD register // width result exceeds the vector length for (size_t base = 0; base + width < vector->size(); base += width) { auto simdBuffer = loadSIMDValueBufferAt( static_cast*>(vector.get()), - // Though sizeof(bool) = 1, while a bool only occupies 1 bit in - // SIMD. - std::is_same_v ? base / 8 : base * sizeof(T)); + (base * element_bitwidth) / 8); AssertSimdElement::eq( expected.data(), simdBuffer, base); }