/* 
A benchmark of 8-bit vs 16-bit blending for ColorFromPalette when
filling 2,000 LEDs on AVR.
  * on the left: 8-bit indexing
  * on the right: 16-bit indexing
  * uncomment one of the `TYPE_TO_TEST` macros to choose amongst the contenders

Results:
  * 16-colour RGB palettes
    * 12.5ms vs 16.7ms = 35% slower
      * worst case. The awkward bit-shift amounts really penalise AVR here.
  * 32-colour RGB palettes
    * 11.1ms vs 13.6ms = 23% slower
      * the bit-shifts are slightly more amenable to the AVR instruction
        set, so both versions are a little faster than the 16-colour versions.
  * 256-colour RGB palettes
    * 4.7ms vs 11.3ms = 136% slower
      * The 8-bit variant is twice the speed of any other variant. The
        code doesn't do any blending, it simply picks a colour directly
        from the 256-colour palette.
        The 16-bit variant is the fastest of the 16-bit contenders as it needs
        no bit-shifting.

See:
  * https://github.com/FastLED/FastLED/pull/202
  * https://github.com/FastLED/FastLED/pull/1687

*/

// Palette object handed to ColorFromPalette() in both timed loops of loop().
// Leave exactly one of the definitions below uncommented.
#define TYPE_TO_TEST pal16
// #define TYPE_TO_TEST pal32
// #define TYPE_TO_TEST pal256

#include <FastLED.h>
#include "ColorFromPalette16bit.h"


// Matrix dimensions used to derive NUM_LEDS.
#define kMatrixWidth 40
#define kMatrixHeight 50
// Number of LEDs filled on every benchmark pass (40 * 50 = 2000).
#define NUM_LEDS ((kMatrixWidth) * (kMatrixHeight))

// Pixel buffer written by both timed loops; registered with both LED outputs in setup().
CRGB leds[NUM_LEDS];
// Source palette; pal32 and pal256 are derived from it in setup().
CRGBPalette16 pal16 = RainbowStripeColors_p;
// Filled in setup() with 32 samples taken from pal16.
CRGBPalette32 pal32;
// Filled in setup() with 256 samples taken from pal16.
CRGBPalette256 pal256;

// One-time initialisation: registers the LED outputs, opens the serial port
// and builds the 32- and 256-entry palettes by sampling pal16.
void setup() {
  // The same pixel buffer is registered twice (data pins 3 and 4) so that
  // loop() can send each benchmark pass to its own output.
  FastLED.addLeds<WS2812B, 3, GRB>(leds, NUM_LEDS);
  FastLED.addLeds<WS2812B, 4, GRB>(leds, NUM_LEDS);
  Serial.begin(2000000);

  // stretch the 16-colour palette to fill the 32-colour palette
  for (int index = 0; index < 32; index++) {
    // Shift in unsigned arithmetic: where int is 16 bits wide (e.g. AVR),
    // `index << 11` no longer fits a signed int once index reaches 16.
    const uint16_t position =
        static_cast<uint16_t>(static_cast<uint16_t>(index) << 11);
    pal32[index] = ColorFromPalette(pal16, position, 255, LINEARBLEND);
  }

  // stretch the 16-colour palette to fill the 256-colour palette
  for (int index = 0; index < 256; index++) {
    // Same reasoning as above: `index << 8` exceeds a 16-bit signed int
    // from index 128 onwards, so the shift is done on a uint16_t.
    const uint16_t position =
        static_cast<uint16_t>(static_cast<uint16_t>(index) << 8);
    pal256[index] = ColorFromPalette(pal16, position, 255, LINEARBLEND);
  }
}

void loop() {
  const int iterations = 1;
  static uint16_t indexstart = 0;
  indexstart += 127;

  uint32_t orig_us1 = micros();
  for (uint32_t i = 0; i < iterations; i++) {
    uint16_t index = indexstart;
    for (uint16_t ledno = 0; ledno < NUM_LEDS; ledno++) {
      leds[ledno] = ColorFromPalette(TYPE_TO_TEST, (uint8_t) (index >> 8), 255, LINEARBLEND);
      index += 9;
    }
  }
  uint32_t orig_us2 = micros();
  FastLED[0].showLeds();

  uint32_t fix_us1 = micros();
  for (uint32_t i = 0; i < iterations; i++) {
    uint16_t index = indexstart;
    for (uint16_t ledno = 0; ledno < NUM_LEDS; ledno++) {
      leds[ledno] = ColorFromPalette(TYPE_TO_TEST, index, 255, LINEARBLEND);
      index += 9;
    }
  }
  uint32_t fix_us2 = micros();
  FastLED[1].showLeds();

  Serial.print(float(orig_us2 - orig_us1) / iterations);
  Serial.print("μs vs ");
  Serial.print(float(fix_us2 - fix_us1) / iterations);
  Serial.print("μs  % change: ");
  Serial.println(float(fix_us2 - fix_us1) / float(orig_us2 - orig_us1) * 100.f - 100.f);
}