xsimd.patch

   1 diff --git a/dom/media/webaudio/AudioNodeEngineGeneric.h b/dom/media/webaudio/AudioNodeEngineGeneric.h
   2 --- a/dom/media/webaudio/AudioNodeEngineGeneric.h
   3 +++ b/dom/media/webaudio/AudioNodeEngineGeneric.h
   4 @@ -203,14 +203,14 @@
   5      MOZ_ASSERT((aSize % xsimd::batch<float, Arch>::size == 0),
   6                 "requires tail processing");
   7
   8      MOZ_UNROLL(2)
   9      for (unsigned i = 0; i < aSize * 2;
  10 -         i += 2 * xsimd::batch<std::complex<float>>::size) {
  11 -      auto in1 = xsimd::batch<std::complex<float>>::load_aligned(
  12 +         i += 2 * xsimd::batch<std::complex<float>, Arch>::size) {
  13 +      auto in1 = xsimd::batch<std::complex<float>, Arch>::load_aligned(
  14            reinterpret_cast<const std::complex<float>*>(&aInput[i]));
  15 -      auto in2 = xsimd::batch<std::complex<float>>::load_aligned(
  16 +      auto in2 = xsimd::batch<std::complex<float>, Arch>::load_aligned(
  17            reinterpret_cast<const std::complex<float>*>(&aScale[i]));
  18        auto out = in1 * in2;
  19        out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
  20      }
  21    };
  22
  23 diff --git a/dom/media/webaudio/AudioNodeEngineGeneric.h b/dom/media/webaudio/AudioNodeEngineGeneric.h
  24 --- a/dom/media/webaudio/AudioNodeEngineGeneric.h
  25 +++ b/dom/media/webaudio/AudioNodeEngineGeneric.h
  26 @@ -5,331 +5,54 @@
  27
  28  #ifndef MOZILLA_AUDIONODEENGINEGENERIC_H_
  29  #define MOZILLA_AUDIONODEENGINEGENERIC_H_
  30
  31  #include "AudioNodeEngine.h"
  32 -#include "AlignmentUtils.h"
  33
  34  #include "xsimd/xsimd.hpp"
  35
  36 -#if defined(__GNUC__) && __GNUC__ > 7
  37 -#  define MOZ_PRAGMA(tokens) _Pragma(#tokens)
  38 -#  define MOZ_UNROLL(factor) MOZ_PRAGMA(GCC unroll factor)
  39 -#elif defined(__INTEL_COMPILER) || (defined(__clang__) && __clang_major__ > 3)
  40 -#  define MOZ_PRAGMA(tokens) _Pragma(#tokens)
  41 -#  define MOZ_UNROLL(factor) MOZ_PRAGMA(unroll factor)
  42 -#else
  43 -#  define MOZ_UNROLL(_)
  44 -#endif
  45 -
  46  namespace mozilla {
  47
  48  template <class Arch>
  49 -static bool is_aligned(const void* ptr) {
  50 -  return (reinterpret_cast<uintptr_t>(ptr) &
  51 -          ~(static_cast<uintptr_t>(Arch::alignment()) - 1)) ==
  52 -         reinterpret_cast<uintptr_t>(ptr);
  53 -};
  54 -
  55 -template <class Arch>
  56  struct Engine {
  57    static void AudioBufferAddWithScale(const float* aInput, float aScale,
  58 -                                      float* aOutput, uint32_t aSize) {
  59 -    if constexpr (Arch::requires_alignment()) {
  60 -      if (aScale == 1.0f) {
  61 -        while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {
  62 -          if (!aSize) return;
  63 -          *aOutput += *aInput;
  64 -          ++aOutput;
  65 -          ++aInput;
  66 -          --aSize;
  67 -        }
  68 -      } else {
  69 -        while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {
  70 -          if (!aSize) return;
  71 -          *aOutput += *aInput * aScale;
  72 -          ++aOutput;
  73 -          ++aInput;
  74 -          --aSize;
  75 -        }
  76 -      }
  77 -    }
  78 -    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
  79 -    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
  80 -
  81 -    xsimd::batch<float, Arch> vgain(aScale);
  82 -
  83 -    uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
  84 -    MOZ_UNROLL(4)
  85 -    for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
  86 -      auto vin1 = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
  87 -      auto vin2 = xsimd::batch<float, Arch>::load_aligned(&aOutput[i]);
  88 -      auto vout = xsimd::fma(vin1, vgain, vin2);
  89 -      vout.store_aligned(&aOutput[i]);
  90 -    }
  91 -
  92 -    for (unsigned i = aVSize; i < aSize; ++i) {
  93 -      aOutput[i] += aInput[i] * aScale;
  94 -    }
  95 -  };
  96 +                                      float* aOutput, uint32_t aSize);
  97
  98    static void AudioBlockCopyChannelWithScale(const float* aInput, float aScale,
  99 -                                             float* aOutput) {
 100 -    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
 101 -    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
 102 -
 103 -    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
 104 -               "requires tail processing");
 105 -
 106 -    xsimd::batch<float, Arch> vgain = (aScale);
 107 -
 108 -    MOZ_UNROLL(4)
 109 -    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
 110 -         i += xsimd::batch<float, Arch>::size) {
 111 -      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
 112 -      auto vout = vin * vgain;
 113 -      vout.store_aligned(&aOutput[i]);
 114 -    }
 115 -  };
 116 +                                             float* aOutput);
 117
 118    static void AudioBlockCopyChannelWithScale(
 119        const float aInput[WEBAUDIO_BLOCK_SIZE],
 120        const float aScale[WEBAUDIO_BLOCK_SIZE],
 121 -      float aOutput[WEBAUDIO_BLOCK_SIZE]) {
 122 -    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
 123 -    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
 124 -    MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
 125 -
 126 -    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
 127 -               "requires tail processing");
 128 -
 129 -    MOZ_UNROLL(4)
 130 -    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
 131 -         i += xsimd::batch<float, Arch>::size) {
 132 -      auto vscaled = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
 133 -      auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
 134 -      auto vout = vin * vscaled;
 135 -      vout.store_aligned(&aOutput[i]);
 136 -    }
 137 -  };
 138 +      float aOutput[WEBAUDIO_BLOCK_SIZE]);
 139
 140    static void AudioBufferInPlaceScale(float* aBlock, float aScale,
 141 -                                      uint32_t aSize) {
 142 -    MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
 143 -
 144 -    xsimd::batch<float, Arch> vgain(aScale);
 145 -
 146 -    uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
 147 -    MOZ_UNROLL(4)
 148 -    for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
 149 -      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
 150 -      auto vout = vin * vgain;
 151 -      vout.store_aligned(&aBlock[i]);
 152 -    }
 153 -    for (unsigned i = aVSize; i < aSize; ++i) aBlock[i] *= aScale;
 154 -  };
 155 +                                      uint32_t aSize);
 156
 157    static void AudioBufferInPlaceScale(float* aBlock, float* aScale,
 158 -                                      uint32_t aSize) {
 159 -    MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
 160 -    MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
 161 -
 162 -    uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
 163 -    MOZ_UNROLL(4)
 164 -    for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
 165 -      auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
 166 -      auto vgain = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
 167 -      auto vout = vin * vgain;
 168 -      vout.store_aligned(&aBlock[i]);
 169 -    }
 170 -    for (uint32_t i = aVSize; i < aSize; ++i) {
 171 -      *aBlock++ *= *aScale++;
 172 -    }
 173 -  };
 174 +                                      uint32_t aSize);
 175
 176    static void AudioBlockPanStereoToStereo(
 177        const float aInputL[WEBAUDIO_BLOCK_SIZE],
 178        const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR,
 179        bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE],
 180 -      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 181 -    MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");
 182 -    MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
 183 -    MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
 184 -    MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
 185 -
 186 -    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
 187 -               "requires tail processing");
 188 -
 189 -    xsimd::batch<float, Arch> vgainl(aGainL);
 190 -    xsimd::batch<float, Arch> vgainr(aGainR);
 191 -
 192 -    if (aIsOnTheLeft) {
 193 -      MOZ_UNROLL(2)
 194 -      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
 195 -           i += xsimd::batch<float, Arch>::size) {
 196 -        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
 197 -        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
 198 -
 199 -        /* left channel : aOutputL  = aInputL + aInputR * gainL */
 200 -        auto vout = xsimd::fma(vinr, vgainl, vinl);
 201 -        vout.store_aligned(&aOutputL[i]);
 202 -
 203 -        /* right channel : aOutputR = aInputR * gainR */
 204 -        auto vscaled = vinr * vgainr;
 205 -        vscaled.store_aligned(&aOutputR[i]);
 206 -      }
 207 -    } else {
 208 -      MOZ_UNROLL(2)
 209 -      for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
 210 -           i += xsimd::batch<float, Arch>::size) {
 211 -        auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
 212 -        auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
 213 -
 214 -        /* left channel : aInputL * gainL */
 215 -        auto vscaled = vinl * vgainl;
 216 -        vscaled.store_aligned(&aOutputL[i]);
 217 -
 218 -        /* right channel: aOutputR = aInputR + aInputL * gainR */
 219 -        auto vout = xsimd::fma(vinl, vgainr, vinr);
 220 -        vout.store_aligned(&aOutputR[i]);
 221 -      }
 222 -    }
 223 -  };
 224 +      float aOutputR[WEBAUDIO_BLOCK_SIZE]);
 225
 226    static void BufferComplexMultiply(const float* aInput, const float* aScale,
 227 -                                    float* aOutput, uint32_t aSize) {
 228 -    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
 229 -    MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
 230 -    MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
 231 -    MOZ_ASSERT((aSize % xsimd::batch<float, Arch>::size == 0),
 232 -               "requires tail processing");
 233 -
 234 -    MOZ_UNROLL(2)
 235 -    for (unsigned i = 0; i < aSize * 2;
 236 -         i += 2 * xsimd::batch<std::complex<float>, Arch>::size) {
 237 -      auto in1 = xsimd::batch<std::complex<float>, Arch>::load_aligned(
 238 -          reinterpret_cast<const std::complex<float>*>(&aInput[i]));
 239 -      auto in2 = xsimd::batch<std::complex<float>, Arch>::load_aligned(
 240 -          reinterpret_cast<const std::complex<float>*>(&aScale[i]));
 241 -      auto out = in1 * in2;
 242 -      out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
 243 -    }
 244 -  };
 245 -
 246 -  static float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength) {
 247 -    float sum = 0.f;
 248 -
 249 -    if constexpr (Arch::requires_alignment()) {
 250 -      while (!is_aligned<Arch>(aInput)) {
 251 -        if (!aLength) {
 252 -          return sum;
 253 -        }
 254 -        sum += *aInput * *aInput;
 255 -        ++aInput;
 256 -        --aLength;
 257 -      }
 258 -    }
 259 -
 260 -    MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
 261 -
 262 -    constexpr uint32_t unroll_factor = 4;
 263 -    xsimd::batch<float, Arch> accs[unroll_factor] = {0.f, 0.f, 0.f, 0.f};
 264 -
 265 -    uint32_t vLength =
 266 -        aLength & ~(unroll_factor * xsimd::batch<float, Arch>::size - 1);
 267 +                                    float* aOutput, uint32_t aSize);
 268
 269 -    for (uint32_t i = 0; i < vLength;
 270 -         i += unroll_factor * xsimd::batch<float, Arch>::size) {
 271 -      MOZ_UNROLL(4)
 272 -      for (uint32_t j = 0; j < unroll_factor; ++j) {
 273 -        auto in = xsimd::batch<float, Arch>::load_aligned(
 274 -            &aInput[i + xsimd::batch<float, Arch>::size * j]);
 275 -        accs[j] = xsimd::fma(in, in, accs[j]);
 276 -      }
 277 -    }
 278 -
 279 -    sum += reduce_add((accs[0] + accs[1]) + (accs[2] + accs[3]));
 280 -    for (uint32_t i = vLength; i < aLength; ++i) sum += aInput[i] * aInput[i];
 281 -    return sum;
 282 -  };
 283 +  static float AudioBufferSumOfSquares(const float* aInput, uint32_t aLength);
 284
 285 -  static void NaNToZeroInPlace(float* aSamples, size_t aCount) {
 286 -    if constexpr (Arch::requires_alignment()) {
 287 -      while (!is_aligned<Arch>(aSamples)) {
 288 -        if (!aCount) {
 289 -          return;
 290 -        }
 291 -        if (*aSamples != *aSamples) {
 292 -          *aSamples = 0.0;
 293 -        }
 294 -        ++aSamples;
 295 -        --aCount;
 296 -      }
 297 -    }
 298 -
 299 -    MOZ_ASSERT(is_aligned<Arch>(aSamples), "aSamples is aligned");
 300 -
 301 -    uint32_t vCount = aCount & ~(xsimd::batch<float, Arch>::size - 1);
 302 -
 303 -    MOZ_UNROLL(4)
 304 -    for (uint32_t i = 0; i < vCount; i += xsimd::batch<float, Arch>::size) {
 305 -      auto vin = xsimd::batch<float, Arch>::load_aligned(&aSamples[i]);
 306 -      auto vout =
 307 -          xsimd::select(xsimd::isnan(vin), xsimd::batch<float, Arch>(0.f), vin);
 308 -      vout.store_aligned(&aSamples[i]);
 309 -    }
 310 -
 311 -    for (uint32_t i = vCount; i < aCount; i++) {
 312 -      if (aSamples[i] != aSamples[i]) {
 313 -        aSamples[i] = 0.0;
 314 -      }
 315 -    }
 316 -  };
 317 +  static void NaNToZeroInPlace(float* aSamples, size_t aCount);
 318
 319    static void AudioBlockPanStereoToStereo(
 320        const float aInputL[WEBAUDIO_BLOCK_SIZE],
 321        const float aInputR[WEBAUDIO_BLOCK_SIZE],
 322        const float aGainL[WEBAUDIO_BLOCK_SIZE],
 323        const float aGainR[WEBAUDIO_BLOCK_SIZE],
 324        const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
 325 -      float aOutputL[WEBAUDIO_BLOCK_SIZE],
 326 -      float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
 327 -    MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");
 328 -    MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
 329 -    MOZ_ASSERT(is_aligned<Arch>(aGainL), "aGainL is aligned");
 330 -    MOZ_ASSERT(is_aligned<Arch>(aGainR), "aGainR is aligned");
 331 -    MOZ_ASSERT(is_aligned<Arch>(aIsOnTheLeft), "aIsOnTheLeft is aligned");
 332 -    MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
 333 -    MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
 334 -
 335 -    MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),
 336 -               "requires tail processing");
 337 -
 338 -    MOZ_UNROLL(2)
 339 -    for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE;
 340 -         i += xsimd::batch<float, Arch>::size) {
 341 -      auto mask =
 342 -          xsimd::batch_bool<float, Arch>::load_aligned(&aIsOnTheLeft[i]);
 343 -
 344 -      auto inputL = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
 345 -      auto inputR = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
 346 -      auto gainL = xsimd::batch<float, Arch>::load_aligned(&aGainL[i]);
 347 -      auto gainR = xsimd::batch<float, Arch>::load_aligned(&aGainR[i]);
 348 -
 349 -      auto outL_true = xsimd::fma(inputR, gainL, inputL);
 350 -      auto outR_true = inputR * gainR;
 351 -
 352 -      auto outL_false = inputL * gainL;
 353 -      auto outR_false = xsimd::fma(inputL, gainR, inputR);
 354 -
 355 -      auto outL = xsimd::select(mask, outL_true, outL_false);
 356 -      auto outR = xsimd::select(mask, outR_true, outR_false);
 357 -
 358 -      outL.store_aligned(&aOutputL[i]);
 359 -      outR.store_aligned(&aOutputR[i]);
 360 -    }
 361 -  }
 362 +      float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]);
 363  };
 364
 365  }  // namespace mozilla
 366
 367  #endif
 368 diff --git a/dom/media/webaudio/AudioNodeEngineGenericImpl.h b/dom/media/webaudio/AudioNodeEngineGenericImpl.h
 369 new file mode 100644
 370 --- /dev/null
 371 +++ b/dom/media/webaudio/AudioNodeEngineGenericImpl.h
 372 @@ -0,0 +1,341 @@
 373 +/* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
  374 +/* This Source Code Form is subject to the terms of the Mozilla Public
  375 + * License, v. 2.0. If a copy of the MPL was not distributed with this file,
 376 + * You can obtain one at http://mozilla.org/MPL/2.0/. */
 377 +
 378 +#ifndef MOZILLA_AUDIONODEENGINEGENERICIMPL_H_
 379 +#define MOZILLA_AUDIONODEENGINEGENERICIMPL_H_
 380 +
 381 +#include "AudioNodeEngineGeneric.h"
 382 +#include "AlignmentUtils.h"
 383 +
 384 +#if defined(__GNUC__) && __GNUC__ > 7
 385 +#  define MOZ_PRAGMA(tokens) _Pragma(#tokens)
 386 +#  define MOZ_UNROLL(factor) MOZ_PRAGMA(GCC unroll factor)
 387 +#elif defined(__INTEL_COMPILER) || (defined(__clang__) && __clang_major__ > 3)
 388 +#  define MOZ_PRAGMA(tokens) _Pragma(#tokens)
 389 +#  define MOZ_UNROLL(factor) MOZ_PRAGMA(unroll factor)
 390 +#else
 391 +#  define MOZ_UNROLL(_)
 392 +#endif
 393 +
 394 +namespace mozilla {
 395 +
  396 +template <class Arch>  // is_aligned<Arch>(ptr): does ptr satisfy Arch::alignment()?
  397 +static bool is_aligned(const void* ptr) {
  398 +  return (reinterpret_cast<uintptr_t>(ptr) &
  399 +          ~(static_cast<uintptr_t>(Arch::alignment()) - 1)) ==  // clear the low bits (assumes a power-of-two alignment)
  400 +         reinterpret_cast<uintptr_t>(ptr);  // unchanged by the mask => already aligned
  401 +};
 402 +
  403 +template <class Arch>  // aOutput[i] += aInput[i] * aScale for i in [0, aSize)
  404 +void Engine<Arch>::AudioBufferAddWithScale(const float* aInput, float aScale,
  405 +                                           float* aOutput, uint32_t aSize) {
  406 +  if constexpr (Arch::requires_alignment()) {  // scalar prologue, compiled in only for such archs
  407 +    if (aScale == 1.0f) {  // unit gain: plain add, no multiply
  408 +      while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {  // one sample at a time until BOTH are aligned
  409 +        if (!aSize) return;  // buffer exhausted before both pointers lined up
  410 +        *aOutput += *aInput;
  411 +        ++aOutput;
  412 +        ++aInput;
  413 +        --aSize;
  414 +      }
  415 +    } else {
  416 +      while (!is_aligned<Arch>(aInput) || !is_aligned<Arch>(aOutput)) {  // same prologue, with the gain applied
  417 +        if (!aSize) return;
  418 +        *aOutput += *aInput * aScale;
  419 +        ++aOutput;
  420 +        ++aInput;
  421 +        --aSize;
  422 +      }
  423 +    }
  424 +  }
  425 +  MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
  426 +  MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
  427 +
  428 +  xsimd::batch<float, Arch> vgain(aScale);  // batch built from the scalar gain
  429 +
  430 +  uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);  // aSize rounded down to whole batches (power-of-two batch size assumed)
  431 +  MOZ_UNROLL(4)
  432 +  for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
  433 +    auto vin1 = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
  434 +    auto vin2 = xsimd::batch<float, Arch>::load_aligned(&aOutput[i]);
  435 +    auto vout = xsimd::fma(vin1, vgain, vin2);  // vin1 * vgain + vin2
  436 +    vout.store_aligned(&aOutput[i]);
  437 +  }
  438 +
  439 +  for (unsigned i = aVSize; i < aSize; ++i) {  // scalar tail for the samples past the last whole batch
  440 +    aOutput[i] += aInput[i] * aScale;
  441 +  }
  442 +}
 443 +
  444 +template <class Arch>  // aOutput[i] = aInput[i] * aScale over WEBAUDIO_BLOCK_SIZE samples
  445 +void Engine<Arch>::AudioBlockCopyChannelWithScale(const float* aInput,
  446 +                                                  float aScale,
  447 +                                                  float* aOutput) {
  448 +  MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");  // no scalar prologue here: callers must pass aligned buffers
  449 +  MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
  450 +
  451 +  MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),  // no tail loop below, so the block must be whole batches
  452 +             "requires tail processing");
  453 +
  454 +  xsimd::batch<float, Arch> vgain = (aScale);  // batch built from the scalar gain
  455 +
  456 +  MOZ_UNROLL(4)
  457 +  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
  458 +       i += xsimd::batch<float, Arch>::size) {
  459 +    auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
  460 +    auto vout = vin * vgain;
  461 +    vout.store_aligned(&aOutput[i]);
  462 +  }
  463 +};
 464 +
  465 +template <class Arch>  // aOutput[i] = aInput[i] * aScale[i] over WEBAUDIO_BLOCK_SIZE samples
  466 +void Engine<Arch>::AudioBlockCopyChannelWithScale(
  467 +    const float aInput[WEBAUDIO_BLOCK_SIZE],
  468 +    const float aScale[WEBAUDIO_BLOCK_SIZE],
  469 +    float aOutput[WEBAUDIO_BLOCK_SIZE]) {
  470 +  MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");  // no scalar prologue here: callers must pass aligned buffers
  471 +  MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
  472 +  MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
  473 +
  474 +  MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),  // no tail loop below, so the block must be whole batches
  475 +             "requires tail processing");
  476 +
  477 +  MOZ_UNROLL(4)
  478 +  for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
  479 +       i += xsimd::batch<float, Arch>::size) {
  480 +    auto vscaled = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);  // per-sample gains for this batch
  481 +    auto vin = xsimd::batch<float, Arch>::load_aligned(&aInput[i]);
  482 +    auto vout = vin * vscaled;
  483 +    vout.store_aligned(&aOutput[i]);
  484 +  }
  485 +};
 486 +
  487 +template <class Arch>  // aBlock[i] *= aScale for i in [0, aSize)
  488 +void Engine<Arch>::AudioBufferInPlaceScale(float* aBlock, float aScale,
  489 +                                           uint32_t aSize) {
  490 +  MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");  // no scalar prologue here: callers must pass an aligned buffer
  491 +
  492 +  xsimd::batch<float, Arch> vgain(aScale);  // batch built from the scalar gain
  493 +
  494 +  uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);  // aSize rounded down to whole batches (power-of-two batch size assumed)
  495 +  MOZ_UNROLL(4)
  496 +  for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
  497 +    auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
  498 +    auto vout = vin * vgain;
  499 +    vout.store_aligned(&aBlock[i]);
  500 +  }
  501 +  for (unsigned i = aVSize; i < aSize; ++i) aBlock[i] *= aScale;  // scalar tail past the last whole batch
  502 +};
 503 +
 504 +template <class Arch>
 505 +void Engine<Arch>::AudioBufferInPlaceScale(float* aBlock, float* aScale,
 506 +                                           uint32_t aSize) {
 507 +  MOZ_ASSERT(is_aligned<Arch>(aBlock), "aBlock is aligned");
 508 +  MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
 509 +
 510 +  uint32_t aVSize = aSize & ~(xsimd::batch<float, Arch>::size - 1);
 511 +  MOZ_UNROLL(4)
 512 +  for (unsigned i = 0; i < aVSize; i += xsimd::batch<float, Arch>::size) {
 513 +    auto vin = xsimd::batch<float, Arch>::load_aligned(&aBlock[i]);
 514 +    auto vgain = xsimd::batch<float, Arch>::load_aligned(&aScale[i]);
 515 +    auto vout = vin * vgain;
 516 +    vout.store_aligned(&aBlock[i]);
 517 +  }
 518 +  for (uint32_t i = aVSize; i < aSize; ++i) {
 519 +    *aBlock++ *= *aScale++;
 520 +  }
 521 +};
 522 +
  523 +template <class Arch>  // stereo pan of one WEBAUDIO_BLOCK_SIZE block with block-constant gains
  524 +void Engine<Arch>::AudioBlockPanStereoToStereo(
  525 +    const float aInputL[WEBAUDIO_BLOCK_SIZE],
  526 +    const float aInputR[WEBAUDIO_BLOCK_SIZE], float aGainL, float aGainR,
  527 +    bool aIsOnTheLeft, float aOutputL[WEBAUDIO_BLOCK_SIZE],
  528 +    float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  529 +  MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");  // no scalar prologue here: callers must pass aligned buffers
  530 +  MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
  531 +  MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
  532 +  MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
  533 +
  534 +  MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),  // no tail loop below, so the block must be whole batches
  535 +             "requires tail processing");
  536 +
  537 +  xsimd::batch<float, Arch> vgainl(aGainL);  // batches built from the scalar gains
  538 +  xsimd::batch<float, Arch> vgainr(aGainR);
  539 +
  540 +  if (aIsOnTheLeft) {  // branch hoisted out of the loops: one loop per pan side
  541 +    MOZ_UNROLL(2)
  542 +    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
  543 +         i += xsimd::batch<float, Arch>::size) {
  544 +      auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
  545 +      auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
  546 +
  547 +      /* left channel : aOutputL  = aInputL + aInputR * gainL */
  548 +      auto vout = xsimd::fma(vinr, vgainl, vinl);
  549 +      vout.store_aligned(&aOutputL[i]);
  550 +
  551 +      /* right channel : aOutputR = aInputR * gainR */
  552 +      auto vscaled = vinr * vgainr;
  553 +      vscaled.store_aligned(&aOutputR[i]);
  554 +    }
  555 +  } else {
  556 +    MOZ_UNROLL(2)
  557 +    for (unsigned i = 0; i < WEBAUDIO_BLOCK_SIZE;
  558 +         i += xsimd::batch<float, Arch>::size) {
  559 +      auto vinl = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
  560 +      auto vinr = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
  561 +
  562 +      /* left channel : aInputL * gainL */
  563 +      auto vscaled = vinl * vgainl;
  564 +      vscaled.store_aligned(&aOutputL[i]);
  565 +
  566 +      /* right channel: aOutputR = aInputR + aInputL * gainR */
  567 +      auto vout = xsimd::fma(vinl, vgainr, vinr);
  568 +      vout.store_aligned(&aOutputR[i]);
  569 +    }
  570 +  }
  571 +};
 572 +
  573 +template <class Arch>  // element-wise complex product: aOutput = aInput * aScale
  574 +void Engine<Arch>::BufferComplexMultiply(const float* aInput,
  575 +                                         const float* aScale, float* aOutput,
  576 +                                         uint32_t aSize) {
  577 +  MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");  // no scalar prologue here: callers must pass aligned buffers
  578 +  MOZ_ASSERT(is_aligned<Arch>(aOutput), "aOutput is aligned");
  579 +  MOZ_ASSERT(is_aligned<Arch>(aScale), "aScale is aligned");
  580 +  MOZ_ASSERT((aSize % xsimd::batch<float, Arch>::size == 0),  // no tail loop below
  581 +             "requires tail processing");
  582 +
  583 +  MOZ_UNROLL(2)
  584 +  for (unsigned i = 0; i < aSize * 2;  // i indexes floats; presumably aSize counts complex values (2 floats each) - confirm with callers
  585 +       i += 2 * xsimd::batch<std::complex<float>, Arch>::size) {
  586 +    auto in1 = xsimd::batch<std::complex<float>, Arch>::load_aligned(
  587 +        reinterpret_cast<const std::complex<float>*>(&aInput[i]));  // float buffer reinterpreted as std::complex<float>
  588 +    auto in2 = xsimd::batch<std::complex<float>, Arch>::load_aligned(
  589 +        reinterpret_cast<const std::complex<float>*>(&aScale[i]));
  590 +    auto out = in1 * in2;
  591 +    out.store_aligned(reinterpret_cast<std::complex<float>*>(&aOutput[i]));
  592 +  }
  593 +};
 594 +
  595 +template <class Arch>  // returns the sum of aInput[i] * aInput[i] over aLength samples
  596 +float Engine<Arch>::AudioBufferSumOfSquares(const float* aInput,
  597 +                                            uint32_t aLength) {
  598 +  float sum = 0.f;
  599 +
  600 +  if constexpr (Arch::requires_alignment()) {  // scalar prologue, compiled in only for such archs
  601 +    while (!is_aligned<Arch>(aInput)) {  // accumulate one sample at a time until aInput is aligned
  602 +      if (!aLength) {
  603 +        return sum;  // buffer exhausted before reaching alignment
  604 +      }
  605 +      sum += *aInput * *aInput;
  606 +      ++aInput;
  607 +      --aLength;
  608 +    }
  609 +  }
  610 +
  611 +  MOZ_ASSERT(is_aligned<Arch>(aInput), "aInput is aligned");
  612 +
  613 +  constexpr uint32_t unroll_factor = 4;
  614 +  xsimd::batch<float, Arch> accs[unroll_factor] = {0.f, 0.f, 0.f, 0.f};  // one independent accumulator per unrolled load
  615 +
  616 +  uint32_t vLength =
  617 +      aLength & ~(unroll_factor * xsimd::batch<float, Arch>::size - 1);  // round down to a multiple of 4 batches
  618 +
  619 +  for (uint32_t i = 0; i < vLength;
  620 +       i += unroll_factor * xsimd::batch<float, Arch>::size) {
  621 +    MOZ_UNROLL(4)
  622 +    for (uint32_t j = 0; j < unroll_factor; ++j) {
  623 +      auto in = xsimd::batch<float, Arch>::load_aligned(
  624 +          &aInput[i + xsimd::batch<float, Arch>::size * j]);
  625 +      accs[j] = xsimd::fma(in, in, accs[j]);  // accs[j] += in * in
  626 +    }
  627 +  }
  628 +
  629 +  sum += reduce_add((accs[0] + accs[1]) + (accs[2] + accs[3]));  // fold the four accumulators into the scalar sum
  630 +  for (uint32_t i = vLength; i < aLength; ++i) sum += aInput[i] * aInput[i];  // scalar tail
  631 +  return sum;
  632 +};
 633 +
 634 +template <class Arch>
 635 +void Engine<Arch>::NaNToZeroInPlace(float* aSamples, size_t aCount) {
 636 +  if constexpr (Arch::requires_alignment()) {
 637 +    while (!is_aligned<Arch>(aSamples)) {
 638 +      if (!aCount) {
 639 +        return;
 640 +      }
 641 +      if (*aSamples != *aSamples) {
 642 +        *aSamples = 0.0;
 643 +      }
 644 +      ++aSamples;
 645 +      --aCount;
 646 +    }
 647 +  }
 648 +
 649 +  MOZ_ASSERT(is_aligned<Arch>(aSamples), "aSamples is aligned");
 650 +
 651 +  uint32_t vCount = aCount & ~(xsimd::batch<float, Arch>::size - 1);
 652 +
 653 +  MOZ_UNROLL(4)
 654 +  for (uint32_t i = 0; i < vCount; i += xsimd::batch<float, Arch>::size) {
 655 +    auto vin = xsimd::batch<float, Arch>::load_aligned(&aSamples[i]);
 656 +    auto vout =
 657 +        xsimd::select(xsimd::isnan(vin), xsimd::batch<float, Arch>(0.f), vin);
 658 +    vout.store_aligned(&aSamples[i]);
 659 +  }
 660 +
 661 +  for (uint32_t i = vCount; i < aCount; i++) {
 662 +    if (aSamples[i] != aSamples[i]) {
 663 +      aSamples[i] = 0.0;
 664 +    }
 665 +  }
 666 +};
 667 +
  668 +template <class Arch>  // stereo pan of one block with per-sample gains and per-sample pan side
  669 +void Engine<Arch>::AudioBlockPanStereoToStereo(
  670 +    const float aInputL[WEBAUDIO_BLOCK_SIZE],
  671 +    const float aInputR[WEBAUDIO_BLOCK_SIZE],
  672 +    const float aGainL[WEBAUDIO_BLOCK_SIZE],
  673 +    const float aGainR[WEBAUDIO_BLOCK_SIZE],
  674 +    const bool aIsOnTheLeft[WEBAUDIO_BLOCK_SIZE],
  675 +    float aOutputL[WEBAUDIO_BLOCK_SIZE], float aOutputR[WEBAUDIO_BLOCK_SIZE]) {
  676 +  MOZ_ASSERT(is_aligned<Arch>(aInputL), "aInputL is aligned");  // no scalar prologue here: callers must pass aligned buffers
  677 +  MOZ_ASSERT(is_aligned<Arch>(aInputR), "aInputR is aligned");
  678 +  MOZ_ASSERT(is_aligned<Arch>(aGainL), "aGainL is aligned");
  679 +  MOZ_ASSERT(is_aligned<Arch>(aGainR), "aGainR is aligned");
  680 +  MOZ_ASSERT(is_aligned<Arch>(aIsOnTheLeft), "aIsOnTheLeft is aligned");
  681 +  MOZ_ASSERT(is_aligned<Arch>(aOutputL), "aOutputL is aligned");
  682 +  MOZ_ASSERT(is_aligned<Arch>(aOutputR), "aOutputR is aligned");
  683 +
  684 +  MOZ_ASSERT((WEBAUDIO_BLOCK_SIZE % xsimd::batch<float, Arch>::size == 0),  // no tail loop below, so the block must be whole batches
  685 +             "requires tail processing");
  686 +
  687 +  MOZ_UNROLL(2)
  688 +  for (uint32_t i = 0; i < WEBAUDIO_BLOCK_SIZE;
  689 +       i += xsimd::batch<float, Arch>::size) {
  690 +    auto mask = xsimd::batch_bool<float, Arch>::load_aligned(&aIsOnTheLeft[i]);  // bool array loaded as a select mask for this batch
  691 +
  692 +    auto inputL = xsimd::batch<float, Arch>::load_aligned(&aInputL[i]);
  693 +    auto inputR = xsimd::batch<float, Arch>::load_aligned(&aInputR[i]);
  694 +    auto gainL = xsimd::batch<float, Arch>::load_aligned(&aGainL[i]);
  695 +    auto gainR = xsimd::batch<float, Arch>::load_aligned(&aGainR[i]);
  696 +
  697 +    auto outL_true = xsimd::fma(inputR, gainL, inputL);  // on the left: L = inputL + inputR * gainL
  698 +    auto outR_true = inputR * gainR;  // on the left: R = inputR * gainR
  699 +
  700 +    auto outL_false = inputL * gainL;  // otherwise: L = inputL * gainL
  701 +    auto outR_false = xsimd::fma(inputL, gainR, inputR);  // otherwise: R = inputR + inputL * gainR
  702 +
  703 +    auto outL = xsimd::select(mask, outL_true, outL_false);  // both variants are computed, then chosen per sample
  704 +    auto outR = xsimd::select(mask, outR_true, outR_false);
  705 +
  706 +    outL.store_aligned(&aOutputL[i]);
  707 +    outR.store_aligned(&aOutputR[i]);
  708 +  }
  709 +}
 710 +
 711 +}  // namespace mozilla
 712 +
 713 +#endif
 714 diff --git a/dom/media/webaudio/AudioNodeEngineNEON.cpp b/dom/media/webaudio/AudioNodeEngineNEON.cpp
 715 --- a/dom/media/webaudio/AudioNodeEngineNEON.cpp
 716 +++ b/dom/media/webaudio/AudioNodeEngineNEON.cpp
 717 @@ -1,9 +1,9 @@
 718  /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 719  /* this source code form is subject to the terms of the mozilla public
 720   * license, v. 2.0. if a copy of the mpl was not distributed with this file,
 721   * You can obtain one at http://mozilla.org/MPL/2.0/. */
 722
 723 -#include "AudioNodeEngineGeneric.h"
 724 +#include "AudioNodeEngineGenericImpl.h"
 725  namespace mozilla {
 726  template struct Engine<xsimd::neon>;
 727  }  // namespace mozilla
 728 diff --git a/dom/media/webaudio/AudioNodeEngineSSE2.cpp b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
 729 --- a/dom/media/webaudio/AudioNodeEngineSSE2.cpp
 730 +++ b/dom/media/webaudio/AudioNodeEngineSSE2.cpp
 731 @@ -1,10 +1,10 @@
 732  /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 733  /* this source code form is subject to the terms of the mozilla public
 734   * license, v. 2.0. if a copy of the mpl was not distributed with this file,
 735   * You can obtain one at http://mozilla.org/MPL/2.0/. */
 736
 737 -#include "AudioNodeEngineGeneric.h"
 738 +#include "AudioNodeEngineGenericImpl.h"
 739
 740  namespace mozilla {
 741  template struct Engine<xsimd::sse2>;
 742  }  // namespace mozilla
 743 diff --git a/dom/media/webaudio/AudioNodeEngineSSE4_2_FMA3.cpp b/dom/media/webaudio/AudioNodeEngineSSE4_2_FMA3.cpp
 744 --- a/dom/media/webaudio/AudioNodeEngineSSE4_2_FMA3.cpp
 745 +++ b/dom/media/webaudio/AudioNodeEngineSSE4_2_FMA3.cpp
 746 @@ -1,10 +1,10 @@
 747  /* -*- mode: c++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
 748  /* this source code form is subject to the terms of the mozilla public
 749   * license, v. 2.0. if a copy of the mpl was not distributed with this file,
 750   * You can obtain one at http://mozilla.org/MPL/2.0/. */
 751
 752 -#include "AudioNodeEngineGeneric.h"
 753 +#include "AudioNodeEngineGenericImpl.h"
 754
 755  namespace mozilla {
 756  template struct Engine<xsimd::fma3<xsimd::sse4_2>>;
 757  }  // namespace mozilla
 758