diff --git a/CMakeLists.txt b/CMakeLists.txt
index e3d871c3..fbeeefb4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -8,6 +8,7 @@ option(OPT_BUILD_SPYSERVER_SOURCE "Build SpyServer Source Module (no dependencie
 option(OPT_BUILD_SOAPY_SOURCE "Build SoapySDR Source Module (Depedencies: soapysdr)" ON)
 option(OPT_BUILD_AIRSPYHF_SOURCE "Build Airspy HF+ Source Module (Depedencies: libairspyhf)" ON)
 option(OPT_BUILD_AIRSPY_SOURCE "Build Airspy Source Module (Depedencies: libairspy)" ON)
+option(OPT_BUILD_BLADERF_SOURCE "Build BladeRF Source Module (Depedencies: libbladeRF)" OFF)
 option(OPT_BUILD_SDRPLAY_SOURCE "Build SDRplay Source Module (Depedencies: libsdrplay)" OFF)
 option(OPT_BUILD_PLUTOSDR_SOURCE "Build PlutoSDR Source Module (Depedencies: libiio, libad9361)" ON)
 option(OPT_BUILD_HACKRF_SOURCE "Build HackRF Source Module (Depedencies: libhackrf)" OFF)
@@ -20,6 +21,7 @@ add_subdirectory("core")
 add_subdirectory("radio")
 add_subdirectory("recorder") 
 add_subdirectory("file_source")
+add_subdirectory("falcon9_decoder")
 
 # Source modules
 if (OPT_BUILD_RTL_TCP_SOURCE)
@@ -42,6 +44,10 @@ if (OPT_BUILD_AIRSPY_SOURCE)
 add_subdirectory("airspy_source")
 endif (OPT_BUILD_AIRSPY_SOURCE)
 
+if (OPT_BUILD_BLADERF_SOURCE)
+add_subdirectory("bladerf_source")
+endif(OPT_BUILD_BLADERF_SOURCE)
+
 if (OPT_BUILD_SDRPLAY_SOURCE)
 add_subdirectory("sdrplay_source")
 endif (OPT_BUILD_SDRPLAY_SOURCE)
@@ -95,7 +101,7 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
     add_custom_target(do_always ALL cp \"$<TARGET_FILE_DIR:sdrpp_core>/libsdrpp_core.dylib\" \"$<TARGET_FILE_DIR:sdrpp>\")
 endif ()
 
-# cmake .. "-DCMAKE_TOOLCHAIN_FILE=C:/Users/Alex/vcpkg/scripts/buildsystems/vcpkg.cmake" -G "Visual Studio 15 2017 Win64"
+# cd
 
 # Install directives
 install(TARGETS sdrpp DESTINATION bin)
diff --git a/bladerf_source/CMakeLists.txt b/bladerf_source/CMakeLists.txt
new file mode 100644
index 00000000..c8b6765a
--- /dev/null
+++ b/bladerf_source/CMakeLists.txt
@@ -0,0 +1,29 @@
+cmake_minimum_required(VERSION 3.13)
+project(bladerf_source)
+
+if (MSVC)
+    set(CMAKE_CXX_FLAGS "-O2 /std:c++17 /EHsc")
+else()
+    set(CMAKE_CXX_FLAGS "-O3 -std=c++17 -fpermissive")
+endif (MSVC)
+
+include_directories("src/")
+
+file(GLOB SRC "src/*.cpp")
+
+add_library(bladerf_source SHARED ${SRC})
+target_link_libraries(bladerf_source PRIVATE sdrpp_core)
+set_target_properties(bladerf_source PROPERTIES PREFIX "")
+
+if (MSVC)
+    # Lib path
+    target_link_directories(sdrpp_core PUBLIC "C:/Program Files/PothosSDR/bin/")
+
+    target_link_libraries(bladerf_source PUBLIC bladeRF)
+else (MSVC)
+    # Not in pkg-config
+    target_link_libraries(bladerf_source PUBLIC libbladeRF)
+endif (MSVC)
+
+# Install directives
+install(TARGETS bladerf_source DESTINATION lib/sdrpp/plugins)
\ No newline at end of file
diff --git a/bladerf_source/src/main.cpp b/bladerf_source/src/main.cpp
new file mode 100644
index 00000000..49cd3c62
--- /dev/null
+++ b/bladerf_source/src/main.cpp
@@ -0,0 +1,339 @@
+#include <imgui.h>
+#include <spdlog/spdlog.h>
+#include <module.h>
+#include <gui/gui.h>
+#include <signal_path/signal_path.h>
+#include <core.h>
+#include <gui/style.h>
+#include <config.h>
+#include <options.h>
+#include <gui/widgets/stepped_slider.h>
+#include <libbladeRF.h>
+#include <dsp/processing.h>
+
+#define CONCAT(a, b) ((std::string(a) + b).c_str())
+
+#define NUM_BUFFERS     128
+#define NUM_TRANSFERS   32
+
+SDRPP_MOD_INFO {
+    /* Name:            */ "bladerf_source",
+    /* Description:     */ "BladeRF source module for SDR++",
+    /* Author:          */ "Ryzerth",
+    /* Version:         */ 0, 1, 0,
+    /* Max instances    */ 1
+};
+
+const uint64_t sampleRates[] = {
+    520834,
+    1000000,
+    2000000,
+    4000000,
+    5000000,
+    8000000,
+    10000000,
+    15000000,
+    20000000,
+    25000000,
+    30000000,
+    35000000,
+    40000000,
+    45000000,
+    50000000,
+    55000000,
+    61440000
+};
+
+const char* sampleRatesTxt = 
+    "520.834KHz\0"
+    "1MHz\0"
+    "2MHz\0"
+    "4MHz\0"
+    "5MHz\0"
+    "8MHz\0"
+    "10MHz\0"
+    "15MHz\0"
+    "20MHz\0"
+    "25MHz\0"
+    "30MHz\0"
+    "35MHz\0"
+    "40MHz\0"
+    "45MHz\0"
+    "50MHz\0"
+    "55MHz\0"
+    "61.44MHz\0";
+
+ConfigManager config;
+
+class BladeRFSourceModule : public ModuleManager::Instance {
+public:
+    BladeRFSourceModule(std::string name) {
+        this->name = name;
+
+        sampleRate = 768000.0;
+
+        handler.ctx = this;
+        handler.selectHandler = menuSelected;
+        handler.deselectHandler = menuDeselected;
+        handler.menuHandler = menuHandler;
+        handler.startHandler = start;
+        handler.stopHandler = stop;
+        handler.tuneHandler = tune;
+        handler.stream = &stream;
+
+        refresh();
+        selectFirst();
+
+        // Select device here
+        core::setInputSampleRate(sampleRate);
+
+        sigpath::sourceManager.registerSource("BladeRF", &handler);
+    }
+
+    ~BladeRFSourceModule() {
+        
+    }
+
+    void enable() {
+        enabled = true;
+    }
+
+    void disable() {
+        enabled = false;
+    }
+
+    bool isEnabled() {
+        return enabled;
+    }
+
+    void refresh() {
+        devListTxt = "";
+
+        if (devInfoList != NULL) {
+            bladerf_free_device_list(devInfoList);
+        }
+
+        devCount = bladerf_get_device_list(&devInfoList);
+        if (devCount < 0) {
+            spdlog::error("Could not list devices");
+            return;
+        }
+        for (int i = 0; i < devCount; i++) {
+            devListTxt += devInfoList[i].serial;
+            devListTxt += '\0';
+        }
+    }
+
+    void selectFirst() {
+        if (devCount > 0) { selectByInfo(&devInfoList[0]); }
+    }
+
+    void selectByInfo(bladerf_devinfo* info) {
+        int ret = bladerf_open_with_devinfo(&openDev, info);
+        if (ret != 0) {
+            spdlog::error("Could not open device {0}", info->serial);
+            return;
+        }
+
+        channelCount = bladerf_get_channel_count(openDev, BLADERF_RX);
+
+        // TODO: Gen sample rate list automatically by detecting which version is selected
+
+        bladerf_close(openDev);
+    }
+
+private:
+    std::string getBandwdithScaled(double bw) {
+        char buf[1024];
+        if (bw >= 1000000.0) {
+            sprintf(buf, "%.1lfMHz", bw / 1000000.0);
+        }
+        else if (bw >= 1000.0) {
+            sprintf(buf, "%.1lfKHz", bw / 1000.0);
+        }
+        else {
+            sprintf(buf, "%.1lfHz", bw);
+        }
+        return std::string(buf);
+    }
+
+    static void menuSelected(void* ctx) {
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)ctx;
+        core::setInputSampleRate(_this->sampleRate);
+        spdlog::info("BladeRFSourceModule '{0}': Menu Select!", _this->name);
+    }
+
+    static void menuDeselected(void* ctx) {
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)ctx;
+        spdlog::info("BladeRFSourceModule '{0}': Menu Deselect!", _this->name);
+    }
+    
+    static void start(void* ctx) {
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)ctx;
+        if (_this->running) {
+            return;
+        }
+        if (_this->devCount == 0) { return; }
+
+        // Open device
+        bladerf_devinfo info = _this->devInfoList[_this->devId];
+        int ret = bladerf_open_with_devinfo(&_this->openDev, &info);
+        if (ret != 0) {
+            spdlog::error("Could not open device {0}", info.serial);
+
+            
+            return;
+        }
+
+        bladerf_sample_rate wantedSr = _this->sampleRate;
+        bladerf_sample_rate actualSr;
+        bladerf_set_sample_rate(_this->openDev, BLADERF_CHANNEL_RX(0), wantedSr, &actualSr);
+        bladerf_set_frequency(_this->openDev, BLADERF_CHANNEL_RX(0), _this->freq);
+
+        if (actualSr != wantedSr) {
+            spdlog::warn("Sample rate rejected: {0} vs {1}", actualSr, wantedSr);
+            return;
+        }
+
+        // Start stream
+        ret = bladerf_init_stream(&_this->rxStream, _this->openDev, callback, &_this->streamBuffers, NUM_BUFFERS, BLADERF_FORMAT_SC16_Q11, 8192, NUM_TRANSFERS, _this);
+        if (ret != 0) {
+            spdlog::error("Could not start stream {0}", ret);
+            return;
+        }
+
+        bladerf_enable_module(_this->openDev, BLADERF_CHANNEL_RX(0), true);
+
+        _this->running = true;
+        _this->workerThread = std::thread(&BladeRFSourceModule::worker, _this);
+
+        spdlog::info("BladeRFSourceModule '{0}': Start!", _this->name);
+    }
+    
+    static void stop(void* ctx) {
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)ctx;
+        if (!_this->running) {
+            return;
+        }
+        _this->running = false;
+        _this->stream.stopWriter();
+        
+        bladerf_enable_module(_this->openDev, BLADERF_CHANNEL_RX(0), false);
+        if (_this->workerThread.joinable()) {
+            _this->workerThread.join();
+        }
+
+        bladerf_close(_this->openDev);
+
+        _this->stream.clearWriteStop();
+        spdlog::info("BladeRFSourceModule '{0}': Stop!", _this->name);
+    }
+    
+    static void tune(double freq, void* ctx) {
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)ctx;
+        _this->freq = freq;
+        if (_this->running) {
+            bladerf_set_frequency(_this->openDev, BLADERF_CHANNEL_RX(0), _this->freq);
+        }
+        spdlog::info("BladeRFSourceModule '{0}': Tune: {1}!", _this->name, freq);
+    }
+    
+    static void menuHandler(void* ctx) {
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)ctx;
+        float menuWidth = ImGui::GetContentRegionAvailWidth();
+
+        if (_this->running) { style::beginDisabled(); }
+
+        ImGui::SetNextItemWidth(menuWidth);
+        if (ImGui::Combo(CONCAT("##_airspyhf_dev_sel_", _this->name), &_this->devId, _this->devListTxt.c_str())) {
+            // Select device
+            core::setInputSampleRate(_this->sampleRate);
+            // Save config
+        }
+
+        if (ImGui::Combo(CONCAT("##_airspyhf_sr_sel_", _this->name), &_this->srId, sampleRatesTxt)) {
+            _this->sampleRate = sampleRates[_this->srId];
+            core::setInputSampleRate(_this->sampleRate);
+            // Save config
+        }
+
+        ImGui::SameLine();
+        float refreshBtnWdith = menuWidth - ImGui::GetCursorPosX();
+        if (ImGui::Button(CONCAT("Refresh##_airspyhf_refr_", _this->name), ImVec2(refreshBtnWdith, 0))) {
+            _this->refresh();
+            config.aquire();
+            std::string devSerial = config.conf["device"];
+            config.release();
+            // Reselect device
+            core::setInputSampleRate(_this->sampleRate);
+        }
+
+        if (_this->running) { style::endDisabled(); }
+
+        // General config BS
+
+    }
+
+    void worker() {
+        bladerf_stream(rxStream, BLADERF_RX_X1);
+    }
+
+    static void* callback(struct bladerf *dev, struct bladerf_stream *stream, struct bladerf_metadata *meta, void *samples, size_t num_samples, void *user_data) {
+        // TODO: Convert with volk
+        BladeRFSourceModule* _this = (BladeRFSourceModule*)user_data;
+        int16_t* samples16 = (int16_t*)samples;
+        _this->currentBuffer = ((_this->currentBuffer + 1) % NUM_BUFFERS);
+        for (size_t i = 0; i < num_samples; i++) {
+            _this->stream.writeBuf[i].i = (float)samples16[(2 * i)] / 32768.0f;
+            _this->stream.writeBuf[i].q = (float)samples16[(2 * i) + 1] / 32768.0f;
+            if (!_this->stream.swap(num_samples)) { return _this->streamBuffers[_this->currentBuffer];; }
+        }
+
+        return _this->streamBuffers[_this->currentBuffer];
+    }
+
+    std::string name;
+    bladerf* openDev;
+    bool enabled = true;
+    dsp::stream<dsp::complex_t> stream;
+    //dsp::Packer<dsp::complex_t> packer(&steam, 2048);
+    double sampleRate;
+    SourceManager::SourceHandler handler;
+    bool running = false;
+    double freq;
+    int devId = 0;
+    int srId = 0;
+
+    int channelCount = 0;
+    int currentBuffer = 0;
+    void** streamBuffers;
+    struct bladerf_stream* rxStream;
+
+    std::thread workerThread;
+
+    int devCount = 0;
+    bladerf_devinfo* devInfoList = NULL;
+    std::string devListTxt;
+};
+
+MOD_EXPORT void _INIT_() {
+    json def = json({});
+    def["devices"] = json({});
+    def["device"] = "";
+    config.setPath(options::opts.root + "/bladerf_config.json");
+    config.load(def);
+    config.enableAutoSave();
+}
+
+MOD_EXPORT ModuleManager::Instance* _CREATE_INSTANCE_(std::string name) {
+    return new BladeRFSourceModule(name);
+}
+
+MOD_EXPORT void _DELETE_INSTANCE_(ModuleManager::Instance* instance) {
+    delete (BladeRFSourceModule*)instance;
+}
+
+MOD_EXPORT void _END_() {
+    config.disableAutoSave();
+    config.save();
+}
\ No newline at end of file
diff --git a/core/src/dsp/deframing.h b/core/src/dsp/deframing.h
new file mode 100644
index 00000000..642d77a1
--- /dev/null
+++ b/core/src/dsp/deframing.h
@@ -0,0 +1,114 @@
+#pragma once
+#include <dsp/block.h>
+#include <inttypes.h>
+
+#define DSP_SIGN(n)     ((n) >= 0)
+#define DSP_STEP(n)     (((n) > 0.0f) ? 1.0f : -1.0f)
+
+namespace dsp {
+    class Deframer : public generic_block<Deframer> {
+    public:
+        Deframer() {}
+
+        Deframer(stream<uint8_t>* in, int frameLen, uint8_t* syncWord, int syncLen) { init(in, frameLen, syncWord, syncLen); }
+
+        ~Deframer() {
+            generic_block<Deframer>::stop();
+        }
+
+        void init(stream<uint8_t>* in, int frameLen, uint8_t* syncWord, int syncLen) {
+            _in = in;
+            _frameLen = frameLen;
+            _syncword = new  uint8_t[syncLen];
+            _syncLen = syncLen;
+            memcpy(_syncword, syncWord, syncLen);
+
+            buffer = new uint8_t[STREAM_BUFFER_SIZE + syncLen];
+            memset(buffer, 0, syncLen);
+            bufferStart = buffer + syncLen;
+            
+            generic_block<Deframer>::registerInput(_in);
+            generic_block<Deframer>::registerOutput(&out);
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+
+            // Copy data into work buffer
+            memcpy(bufferStart, _in->readBuf, count - 1);
+
+            // Iterate through all symbols
+            for (int i = 0; i < count;) {
+
+                // If already in the process of reading bits 
+                if (bitsRead >= 0) {
+                    if ((bitsRead % 8) == 0) { out.writeBuf[bitsRead / 8] = 0; }
+                    out.writeBuf[bitsRead / 8] |= (buffer[i] << (7 - (bitsRead % 8)));
+                    i++;
+                    bitsRead++;
+
+                    if (bitsRead >= _frameLen) {
+                        if (!out.swap((bitsRead / 8) + ((bitsRead % 8) > 0))) { return -1; }
+                        bitsRead = -1;
+                        nextBitIsStartOfFrame = true;
+                    }
+
+                    continue;
+                }
+
+                // Else, check for a header
+                else if (memcmp(buffer + i, _syncword, _syncLen) == 0) {
+                    bitsRead = 0;
+                    badFrameCount = 0;
+                    continue;
+                }
+                else if (nextBitIsStartOfFrame) {
+                    nextBitIsStartOfFrame = false;
+
+                    // try to save 
+                    if (badFrameCount < 5) {
+                        badFrameCount++;
+                        bitsRead = 0;
+                        continue;
+                    }
+
+                }
+
+                else { i++; }
+
+                nextBitIsStartOfFrame = false;
+
+            }
+
+            // Keep last _syncLen4 symbols
+            memcpy(buffer, &_in->readBuf[count - _syncLen], _syncLen);
+            
+            //printf("Block processed\n");
+            callcount++;
+
+            _in->flush();
+            return count;
+        }
+
+        stream<uint8_t> out;
+
+    private:
+        uint8_t* buffer;
+        uint8_t* bufferStart;
+        uint8_t* _syncword;
+        int count;
+        int _frameLen;
+        int _syncLen;
+        int bitsRead = -1;
+
+        int badFrameCount = 5;
+        bool nextBitIsStartOfFrame = false;
+
+        int callcount = 0;
+        
+        stream<uint8_t>* _in;
+
+    };
+}
\ No newline at end of file
diff --git a/core/src/dsp/demodulator.h b/core/src/dsp/demodulator.h
index fc51a47a..91dc3cde 100644
--- a/core/src/dsp/demodulator.h
+++ b/core/src/dsp/demodulator.h
@@ -3,7 +3,7 @@
 #include <volk/volk.h>
 #include <dsp/filter.h>
 #include <dsp/processing.h>
-
+#include <dsp/routing.h>
 #include <spdlog/spdlog.h>
 
 #define FAST_ATAN2_COEF1    FL_M_PI / 4.0f
diff --git a/core/src/dsp/interpolation_taps.h b/core/src/dsp/interpolation_taps.h
new file mode 100644
index 00000000..5550846c
--- /dev/null
+++ b/core/src/dsp/interpolation_taps.h
@@ -0,0 +1,136 @@
+#pragma once
+
+const int INTERP_TAP_COUNT = 8;
+const int INTERP_STEPS = 128;
+
+const float INTERP_TAPS[INTERP_STEPS + 1][INTERP_TAP_COUNT] = {
+    { 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00 },
+    { -1.98993e-04, 1.24642e-03, -5.41054e-03, 9.98534e-01, 7.89295e-03, -2.76968e-03, 8.53777e-04, -1.54700e-04 },
+    { -3.96391e-04, 2.47942e-03, -1.07209e-02, 9.96891e-01, 1.58840e-02, -5.55134e-03, 1.70888e-03, -3.09412e-04 },
+    { -5.92100e-04, 3.69852e-03, -1.59305e-02, 9.95074e-01, 2.39714e-02, -8.34364e-03, 2.56486e-03, -4.64053e-04 },
+    { -7.86031e-04, 4.90322e-03, -2.10389e-02, 9.93082e-01, 3.21531e-02, -1.11453e-02, 3.42130e-03, -6.18544e-04 },
+    { -9.78093e-04, 6.09305e-03, -2.60456e-02, 9.90917e-01, 4.04274e-02, -1.39548e-02, 4.27773e-03, -7.72802e-04 },
+    { -1.16820e-03, 7.26755e-03, -3.09503e-02, 9.88580e-01, 4.87921e-02, -1.67710e-02, 5.13372e-03, -9.26747e-04 },
+    { -1.35627e-03, 8.42626e-03, -3.57525e-02, 9.86071e-01, 5.72454e-02, -1.95925e-02, 5.98883e-03, -1.08030e-03 },
+    { -1.54221e-03, 9.56876e-03, -4.04519e-02, 9.83392e-01, 6.57852e-02, -2.24178e-02, 6.84261e-03, -1.23337e-03 },
+    { -1.72594e-03, 1.06946e-02, -4.50483e-02, 9.80543e-01, 7.44095e-02, -2.52457e-02, 7.69462e-03, -1.38589e-03 },
+    { -1.90738e-03, 1.18034e-02, -4.95412e-02, 9.77526e-01, 8.31162e-02, -2.80746e-02, 8.54441e-03, -1.53777e-03 },
+    { -2.08645e-03, 1.28947e-02, -5.39305e-02, 9.74342e-01, 9.19033e-02, -3.09033e-02, 9.39154e-03, -1.68894e-03 },
+    { -2.26307e-03, 1.39681e-02, -5.82159e-02, 9.70992e-01, 1.00769e-01, -3.37303e-02, 1.02356e-02, -1.83931e-03 },
+    { -2.43718e-03, 1.50233e-02, -6.23972e-02, 9.67477e-01, 1.09710e-01, -3.65541e-02, 1.10760e-02, -1.98880e-03 },
+    { -2.60868e-03, 1.60599e-02, -6.64743e-02, 9.63798e-01, 1.18725e-01, -3.93735e-02, 1.19125e-02, -2.13733e-03 },
+    { -2.77751e-03, 1.70776e-02, -7.04471e-02, 9.59958e-01, 1.27812e-01, -4.21869e-02, 1.27445e-02, -2.28483e-03 },
+    { -2.94361e-03, 1.80759e-02, -7.43154e-02, 9.55956e-01, 1.36968e-01, -4.49929e-02, 1.35716e-02, -2.43121e-03 },
+    { -3.10689e-03, 1.90545e-02, -7.80792e-02, 9.51795e-01, 1.46192e-01, -4.77900e-02, 1.43934e-02, -2.57640e-03 },
+    { -3.26730e-03, 2.00132e-02, -8.17385e-02, 9.47477e-01, 1.55480e-01, -5.05770e-02, 1.52095e-02, -2.72032e-03 },
+    { -3.42477e-03, 2.09516e-02, -8.52933e-02, 9.43001e-01, 1.64831e-01, -5.33522e-02, 1.60193e-02, -2.86289e-03 },
+    { -3.57923e-03, 2.18695e-02, -8.87435e-02, 9.38371e-01, 1.74242e-01, -5.61142e-02, 1.68225e-02, -3.00403e-03 },
+    { -3.73062e-03, 2.27664e-02, -9.20893e-02, 9.33586e-01, 1.83711e-01, -5.88617e-02, 1.76185e-02, -3.14367e-03 },
+    { -3.87888e-03, 2.36423e-02, -9.53307e-02, 9.28650e-01, 1.93236e-01, -6.15931e-02, 1.84071e-02, -3.28174e-03 },
+    { -4.02397e-03, 2.44967e-02, -9.84679e-02, 9.23564e-01, 2.02814e-01, -6.43069e-02, 1.91877e-02, -3.41815e-03 },
+    { -4.16581e-03, 2.53295e-02, -1.01501e-01, 9.18329e-01, 2.12443e-01, -6.70018e-02, 1.99599e-02, -3.55283e-03 },
+    { -4.30435e-03, 2.61404e-02, -1.04430e-01, 9.12947e-01, 2.22120e-01, -6.96762e-02, 2.07233e-02, -3.68570e-03 },
+    { -4.43955e-03, 2.69293e-02, -1.07256e-01, 9.07420e-01, 2.31843e-01, -7.23286e-02, 2.14774e-02, -3.81671e-03 },
+    { -4.57135e-03, 2.76957e-02, -1.09978e-01, 9.01749e-01, 2.41609e-01, -7.49577e-02, 2.22218e-02, -3.94576e-03 },
+    { -4.69970e-03, 2.84397e-02, -1.12597e-01, 8.95936e-01, 2.51417e-01, -7.75620e-02, 2.29562e-02, -4.07279e-03 },
+    { -4.82456e-03, 2.91609e-02, -1.15113e-01, 8.89984e-01, 2.61263e-01, -8.01399e-02, 2.36801e-02, -4.19774e-03 },
+    { -4.94589e-03, 2.98593e-02, -1.17526e-01, 8.83893e-01, 2.71144e-01, -8.26900e-02, 2.43930e-02, -4.32052e-03 },
+    { -5.06363e-03, 3.05345e-02, -1.19837e-01, 8.77666e-01, 2.81060e-01, -8.52109e-02, 2.50946e-02, -4.44107e-03 },
+    { -5.17776e-03, 3.11866e-02, -1.22047e-01, 8.71305e-01, 2.91006e-01, -8.77011e-02, 2.57844e-02, -4.55932e-03 },
+    { -5.28823e-03, 3.18153e-02, -1.24154e-01, 8.64812e-01, 3.00980e-01, -9.01591e-02, 2.64621e-02, -4.67520e-03 },
+    { -5.39500e-03, 3.24205e-02, -1.26161e-01, 8.58189e-01, 3.10980e-01, -9.25834e-02, 2.71272e-02, -4.78866e-03 },
+    { -5.49804e-03, 3.30021e-02, -1.28068e-01, 8.51437e-01, 3.21004e-01, -9.49727e-02, 2.77794e-02, -4.89961e-03 },
+    { -5.59731e-03, 3.35600e-02, -1.29874e-01, 8.44559e-01, 3.31048e-01, -9.73254e-02, 2.84182e-02, -5.00800e-03 },
+    { -5.69280e-03, 3.40940e-02, -1.31581e-01, 8.37557e-01, 3.41109e-01, -9.96402e-02, 2.90433e-02, -5.11376e-03 },
+    { -5.78446e-03, 3.46042e-02, -1.33189e-01, 8.30432e-01, 3.51186e-01, -1.01915e-01, 2.96543e-02, -5.21683e-03 },
+    { -5.87227e-03, 3.50903e-02, -1.34699e-01, 8.23188e-01, 3.61276e-01, -1.04150e-01, 3.02507e-02, -5.31716e-03 },
+    { -5.95620e-03, 3.55525e-02, -1.36111e-01, 8.15826e-01, 3.71376e-01, -1.06342e-01, 3.08323e-02, -5.41467e-03 },
+    { -6.03624e-03, 3.59905e-02, -1.37426e-01, 8.08348e-01, 3.81484e-01, -1.08490e-01, 3.13987e-02, -5.50931e-03 },
+    { -6.11236e-03, 3.64044e-02, -1.38644e-01, 8.00757e-01, 3.91596e-01, -1.10593e-01, 3.19495e-02, -5.60103e-03 },
+    { -6.18454e-03, 3.67941e-02, -1.39767e-01, 7.93055e-01, 4.01710e-01, -1.12650e-01, 3.24843e-02, -5.68976e-03 },
+    { -6.25277e-03, 3.71596e-02, -1.40794e-01, 7.85244e-01, 4.11823e-01, -1.14659e-01, 3.30027e-02, -5.77544e-03 },
+    { -6.31703e-03, 3.75010e-02, -1.41727e-01, 7.77327e-01, 4.21934e-01, -1.16618e-01, 3.35046e-02, -5.85804e-03 },
+    { -6.37730e-03, 3.78182e-02, -1.42566e-01, 7.69305e-01, 4.32038e-01, -1.18526e-01, 3.39894e-02, -5.93749e-03 },
+    { -6.43358e-03, 3.81111e-02, -1.43313e-01, 7.61181e-01, 4.42134e-01, -1.20382e-01, 3.44568e-02, -6.01374e-03 },
+    { -6.48585e-03, 3.83800e-02, -1.43968e-01, 7.52958e-01, 4.52218e-01, -1.22185e-01, 3.49066e-02, -6.08674e-03 },
+    { -6.53412e-03, 3.86247e-02, -1.44531e-01, 7.44637e-01, 4.62289e-01, -1.23933e-01, 3.53384e-02, -6.15644e-03 },
+    { -6.57836e-03, 3.88454e-02, -1.45004e-01, 7.36222e-01, 4.72342e-01, -1.25624e-01, 3.57519e-02, -6.22280e-03 },
+    { -6.61859e-03, 3.90420e-02, -1.45387e-01, 7.27714e-01, 4.82377e-01, -1.27258e-01, 3.61468e-02, -6.28577e-03 },
+    { -6.65479e-03, 3.92147e-02, -1.45682e-01, 7.19116e-01, 4.92389e-01, -1.28832e-01, 3.65227e-02, -6.34530e-03 },
+    { -6.68698e-03, 3.93636e-02, -1.45889e-01, 7.10431e-01, 5.02377e-01, -1.30347e-01, 3.68795e-02, -6.40135e-03 },
+    { -6.71514e-03, 3.94886e-02, -1.46009e-01, 7.01661e-01, 5.12337e-01, -1.31800e-01, 3.72167e-02, -6.45388e-03 },
+    { -6.73929e-03, 3.95900e-02, -1.46043e-01, 6.92808e-01, 5.22267e-01, -1.33190e-01, 3.75341e-02, -6.50285e-03 },
+    { -6.75943e-03, 3.96678e-02, -1.45993e-01, 6.83875e-01, 5.32164e-01, -1.34515e-01, 3.78315e-02, -6.54823e-03 },
+    { -6.77557e-03, 3.97222e-02, -1.45859e-01, 6.74865e-01, 5.42025e-01, -1.35775e-01, 3.81085e-02, -6.58996e-03 },
+    { -6.78771e-03, 3.97532e-02, -1.45641e-01, 6.65779e-01, 5.51849e-01, -1.36969e-01, 3.83650e-02, -6.62802e-03 },
+    { -6.79588e-03, 3.97610e-02, -1.45343e-01, 6.56621e-01, 5.61631e-01, -1.38094e-01, 3.86006e-02, -6.66238e-03 },
+    { -6.80007e-03, 3.97458e-02, -1.44963e-01, 6.47394e-01, 5.71370e-01, -1.39150e-01, 3.88151e-02, -6.69300e-03 },
+    { -6.80032e-03, 3.97077e-02, -1.44503e-01, 6.38099e-01, 5.81063e-01, -1.40136e-01, 3.90083e-02, -6.71985e-03 },
+    { -6.79662e-03, 3.96469e-02, -1.43965e-01, 6.28739e-01, 5.90706e-01, -1.41050e-01, 3.91800e-02, -6.74291e-03 },
+    { -6.78902e-03, 3.95635e-02, -1.43350e-01, 6.19318e-01, 6.00298e-01, -1.41891e-01, 3.93299e-02, -6.76214e-03 },
+    { -6.77751e-03, 3.94578e-02, -1.42658e-01, 6.09836e-01, 6.09836e-01, -1.42658e-01, 3.94578e-02, -6.77751e-03 },
+    { -6.76214e-03, 3.93299e-02, -1.41891e-01, 6.00298e-01, 6.19318e-01, -1.43350e-01, 3.95635e-02, -6.78902e-03 },
+    { -6.74291e-03, 3.91800e-02, -1.41050e-01, 5.90706e-01, 6.28739e-01, -1.43965e-01, 3.96469e-02, -6.79662e-03 },
+    { -6.71985e-03, 3.90083e-02, -1.40136e-01, 5.81063e-01, 6.38099e-01, -1.44503e-01, 3.97077e-02, -6.80032e-03 },
+    { -6.69300e-03, 3.88151e-02, -1.39150e-01, 5.71370e-01, 6.47394e-01, -1.44963e-01, 3.97458e-02, -6.80007e-03 },
+    { -6.66238e-03, 3.86006e-02, -1.38094e-01, 5.61631e-01, 6.56621e-01, -1.45343e-01, 3.97610e-02, -6.79588e-03 },
+    { -6.62802e-03, 3.83650e-02, -1.36969e-01, 5.51849e-01, 6.65779e-01, -1.45641e-01, 3.97532e-02, -6.78771e-03 },
+    { -6.58996e-03, 3.81085e-02, -1.35775e-01, 5.42025e-01, 6.74865e-01, -1.45859e-01, 3.97222e-02, -6.77557e-03 },
+    { -6.54823e-03, 3.78315e-02, -1.34515e-01, 5.32164e-01, 6.83875e-01, -1.45993e-01, 3.96678e-02, -6.75943e-03 },
+    { -6.50285e-03, 3.75341e-02, -1.33190e-01, 5.22267e-01, 6.92808e-01, -1.46043e-01, 3.95900e-02, -6.73929e-03 },
+    { -6.45388e-03, 3.72167e-02, -1.31800e-01, 5.12337e-01, 7.01661e-01, -1.46009e-01, 3.94886e-02, -6.71514e-03 },
+    { -6.40135e-03, 3.68795e-02, -1.30347e-01, 5.02377e-01, 7.10431e-01, -1.45889e-01, 3.93636e-02, -6.68698e-03 },
+    { -6.34530e-03, 3.65227e-02, -1.28832e-01, 4.92389e-01, 7.19116e-01, -1.45682e-01, 3.92147e-02, -6.65479e-03 },
+    { -6.28577e-03, 3.61468e-02, -1.27258e-01, 4.82377e-01, 7.27714e-01, -1.45387e-01, 3.90420e-02, -6.61859e-03 },
+    { -6.22280e-03, 3.57519e-02, -1.25624e-01, 4.72342e-01, 7.36222e-01, -1.45004e-01, 3.88454e-02, -6.57836e-03 },
+    { -6.15644e-03, 3.53384e-02, -1.23933e-01, 4.62289e-01, 7.44637e-01, -1.44531e-01, 3.86247e-02, -6.53412e-03 },
+    { -6.08674e-03, 3.49066e-02, -1.22185e-01, 4.52218e-01, 7.52958e-01, -1.43968e-01, 3.83800e-02, -6.48585e-03 },
+    { -6.01374e-03, 3.44568e-02, -1.20382e-01, 4.42134e-01, 7.61181e-01, -1.43313e-01, 3.81111e-02, -6.43358e-03 },
+    { -5.93749e-03, 3.39894e-02, -1.18526e-01, 4.32038e-01, 7.69305e-01, -1.42566e-01, 3.78182e-02, -6.37730e-03 },
+    { -5.85804e-03, 3.35046e-02, -1.16618e-01, 4.21934e-01, 7.77327e-01, -1.41727e-01, 3.75010e-02, -6.31703e-03 },
+    { -5.77544e-03, 3.30027e-02, -1.14659e-01, 4.11823e-01, 7.85244e-01, -1.40794e-01, 3.71596e-02, -6.25277e-03 },
+    { -5.68976e-03, 3.24843e-02, -1.12650e-01, 4.01710e-01, 7.93055e-01, -1.39767e-01, 3.67941e-02, -6.18454e-03 },
+    { -5.60103e-03, 3.19495e-02, -1.10593e-01, 3.91596e-01, 8.00757e-01, -1.38644e-01, 3.64044e-02, -6.11236e-03 },
+    { -5.50931e-03, 3.13987e-02, -1.08490e-01, 3.81484e-01, 8.08348e-01, -1.37426e-01, 3.59905e-02, -6.03624e-03 },
+    { -5.41467e-03, 3.08323e-02, -1.06342e-01, 3.71376e-01, 8.15826e-01, -1.36111e-01, 3.55525e-02, -5.95620e-03 },
+    { -5.31716e-03, 3.02507e-02, -1.04150e-01, 3.61276e-01, 8.23188e-01, -1.34699e-01, 3.50903e-02, -5.87227e-03 },
+    { -5.21683e-03, 2.96543e-02, -1.01915e-01, 3.51186e-01, 8.30432e-01, -1.33189e-01, 3.46042e-02, -5.78446e-03 },
+    { -5.11376e-03, 2.90433e-02, -9.96402e-02, 3.41109e-01, 8.37557e-01, -1.31581e-01, 3.40940e-02, -5.69280e-03 },
+    { -5.00800e-03, 2.84182e-02, -9.73254e-02, 3.31048e-01, 8.44559e-01, -1.29874e-01, 3.35600e-02, -5.59731e-03 },
+    { -4.89961e-03, 2.77794e-02, -9.49727e-02, 3.21004e-01, 8.51437e-01, -1.28068e-01, 3.30021e-02, -5.49804e-03 },
+    { -4.78866e-03, 2.71272e-02, -9.25834e-02, 3.10980e-01, 8.58189e-01, -1.26161e-01, 3.24205e-02, -5.39500e-03 },
+    { -4.67520e-03, 2.64621e-02, -9.01591e-02, 3.00980e-01, 8.64812e-01, -1.24154e-01, 3.18153e-02, -5.28823e-03 },
+    { -4.55932e-03, 2.57844e-02, -8.77011e-02, 2.91006e-01, 8.71305e-01, -1.22047e-01, 3.11866e-02, -5.17776e-03 },
+    { -4.44107e-03, 2.50946e-02, -8.52109e-02, 2.81060e-01, 8.77666e-01, -1.19837e-01, 3.05345e-02, -5.06363e-03 },
+    { -4.32052e-03, 2.43930e-02, -8.26900e-02, 2.71144e-01, 8.83893e-01, -1.17526e-01, 2.98593e-02, -4.94589e-03 },
+    { -4.19774e-03, 2.36801e-02, -8.01399e-02, 2.61263e-01, 8.89984e-01, -1.15113e-01, 2.91609e-02, -4.82456e-03 },
+    { -4.07279e-03, 2.29562e-02, -7.75620e-02, 2.51417e-01, 8.95936e-01, -1.12597e-01, 2.84397e-02, -4.69970e-03 },
+    { -3.94576e-03, 2.22218e-02, -7.49577e-02, 2.41609e-01, 9.01749e-01, -1.09978e-01, 2.76957e-02, -4.57135e-03 },
+    { -3.81671e-03, 2.14774e-02, -7.23286e-02, 2.31843e-01, 9.07420e-01, -1.07256e-01, 2.69293e-02, -4.43955e-03 },
+    { -3.68570e-03, 2.07233e-02, -6.96762e-02, 2.22120e-01, 9.12947e-01, -1.04430e-01, 2.61404e-02, -4.30435e-03 },
+    { -3.55283e-03, 1.99599e-02, -6.70018e-02, 2.12443e-01, 9.18329e-01, -1.01501e-01, 2.53295e-02, -4.16581e-03 },
+    { -3.41815e-03, 1.91877e-02, -6.43069e-02, 2.02814e-01, 9.23564e-01, -9.84679e-02, 2.44967e-02, -4.02397e-03 },
+    { -3.28174e-03, 1.84071e-02, -6.15931e-02, 1.93236e-01, 9.28650e-01, -9.53307e-02, 2.36423e-02, -3.87888e-03 },
+    { -3.14367e-03, 1.76185e-02, -5.88617e-02, 1.83711e-01, 9.33586e-01, -9.20893e-02, 2.27664e-02, -3.73062e-03 },
+    { -3.00403e-03, 1.68225e-02, -5.61142e-02, 1.74242e-01, 9.38371e-01, -8.87435e-02, 2.18695e-02, -3.57923e-03 },
+    { -2.86289e-03, 1.60193e-02, -5.33522e-02, 1.64831e-01, 9.43001e-01, -8.52933e-02, 2.09516e-02, -3.42477e-03 },
+    { -2.72032e-03, 1.52095e-02, -5.05770e-02, 1.55480e-01, 9.47477e-01, -8.17385e-02, 2.00132e-02, -3.26730e-03 },
+    { -2.57640e-03, 1.43934e-02, -4.77900e-02, 1.46192e-01, 9.51795e-01, -7.80792e-02, 1.90545e-02, -3.10689e-03 },
+    { -2.43121e-03, 1.35716e-02, -4.49929e-02, 1.36968e-01, 9.55956e-01, -7.43154e-02, 1.80759e-02, -2.94361e-03 },
+    { -2.28483e-03, 1.27445e-02, -4.21869e-02, 1.27812e-01, 9.59958e-01, -7.04471e-02, 1.70776e-02, -2.77751e-03 },
+    { -2.13733e-03, 1.19125e-02, -3.93735e-02, 1.18725e-01, 9.63798e-01, -6.64743e-02, 1.60599e-02, -2.60868e-03 },
+    { -1.98880e-03, 1.10760e-02, -3.65541e-02, 1.09710e-01, 9.67477e-01, -6.23972e-02, 1.50233e-02, -2.43718e-03 },
+    { -1.83931e-03, 1.02356e-02, -3.37303e-02, 1.00769e-01, 9.70992e-01, -5.82159e-02, 1.39681e-02, -2.26307e-03 },
+    { -1.68894e-03, 9.39154e-03, -3.09033e-02, 9.19033e-02, 9.74342e-01, -5.39305e-02, 1.28947e-02, -2.08645e-03 },
+    { -1.53777e-03, 8.54441e-03, -2.80746e-02, 8.31162e-02, 9.77526e-01, -4.95412e-02, 1.18034e-02, -1.90738e-03 },
+    { -1.38589e-03, 7.69462e-03, -2.52457e-02, 7.44095e-02, 9.80543e-01, -4.50483e-02, 1.06946e-02, -1.72594e-03 },
+    { -1.23337e-03, 6.84261e-03, -2.24178e-02, 6.57852e-02, 9.83392e-01, -4.04519e-02, 9.56876e-03, -1.54221e-03 },
+    { -1.08030e-03, 5.98883e-03, -1.95925e-02, 5.72454e-02, 9.86071e-01, -3.57525e-02, 8.42626e-03, -1.35627e-03 },
+    { -9.26747e-04, 5.13372e-03, -1.67710e-02, 4.87921e-02, 9.88580e-01, -3.09503e-02, 7.26755e-03, -1.16820e-03 },
+    { -7.72802e-04, 4.27773e-03, -1.39548e-02, 4.04274e-02, 9.90917e-01, -2.60456e-02, 6.09305e-03, -9.78093e-04 },
+    { -6.18544e-04, 3.42130e-03, -1.11453e-02, 3.21531e-02, 9.93082e-01, -2.10389e-02, 4.90322e-03, -7.86031e-04 },
+    { -4.64053e-04, 2.56486e-03, -8.34364e-03, 2.39714e-02, 9.95074e-01, -1.59305e-02, 3.69852e-03, -5.92100e-04 },
+    { -3.09412e-04, 1.70888e-03, -5.55134e-03, 1.58840e-02, 9.96891e-01, -1.07209e-02, 2.47942e-03, -3.96391e-04 },
+    { -1.54700e-04, 8.53777e-04, -2.76968e-03, 7.89295e-03, 9.98534e-01, -5.41054e-03, 1.24642e-03, -1.98993e-04 },
+    { 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00 },
+};
diff --git a/core/src/dsp/pll.h b/core/src/dsp/pll.h
new file mode 100644
index 00000000..7d0c44ad
--- /dev/null
+++ b/core/src/dsp/pll.h
@@ -0,0 +1,185 @@
+#pragma once
+#include <dsp/block.h>
+#include <dsp/interpolation_taps.h>
+
+#define DSP_SIGN(n)     ((n) >= 0)
+#define DSP_STEP(n)     (((n) > 0.0f) ? 1.0f : -1.0f)
+
+namespace dsp {
+    class SqSymbolRecovery : public generic_block<SqSymbolRecovery> {
+    public:
+        SqSymbolRecovery() {}
+
+        SqSymbolRecovery(stream<float>* in, int omega) { init(in, omega); }
+
+        ~SqSymbolRecovery() {
+            generic_block<SqSymbolRecovery>::stop();
+        }
+
+        void init(stream<float>* in, int omega) {
+            _in = in;
+            samplesPerSymbol = omega;
+            generic_block<SqSymbolRecovery>::registerInput(_in);
+            generic_block<SqSymbolRecovery>::registerOutput(&out);
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+            int outCount = 0;
+
+            for (int i = 0; i < count; i++) {
+                if (DSP_SIGN(lastVal) != DSP_SIGN(_in->readBuf[i])) {
+                    counter = samplesPerSymbol / 2;
+                    lastVal = _in->readBuf[i];
+                    continue;
+                }
+
+                if (counter >= samplesPerSymbol) {
+                    counter = 0;
+                    out.writeBuf[outCount] = _in->readBuf[i];
+                    outCount++;
+                }
+                else {
+                    counter++;
+                }
+
+                lastVal = _in->readBuf[i];
+            }
+            
+            _in->flush();
+            if (!out.swap(outCount)) { return -1; }
+            return count;
+        }
+
+        stream<float> out;
+
+    private:
+        int count;
+        int samplesPerSymbol = 1;
+        int counter = 0;
+        float lastVal = 0;
+        stream<float>* _in;
+
+    };
+
+
+
+    class MMClockRecovery : public generic_block<MMClockRecovery> {
+    public:
+        MMClockRecovery() {}
+
+        MMClockRecovery(stream<float>* in, float omega, float gainOmega, float muGain, float omegaRelLimit) {
+            init(in, omega, gainOmega, muGain, omegaRelLimit);
+        }
+
+        ~MMClockRecovery() {
+            generic_block<MMClockRecovery>::stop();
+        }
+
+        void init(stream<float>* in, float omega, float gainOmega, float muGain, float omegaRelLimit) {
+            _in = in;
+            _omega = omega;
+            _muGain = muGain;
+            _gainOmega = gainOmega;
+            _omegaRelLimit = omegaRelLimit;
+
+            omegaMin = _omega - (_omega * _omegaRelLimit);
+            omegaMax = _omega + (_omega * _omegaRelLimit);
+            _dynOmega = _omega;
+
+            generic_block<MMClockRecovery>::registerInput(_in);
+            generic_block<MMClockRecovery>::registerOutput(&out);
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+            int outCount = 0;
+            float outVal;
+            float phaseError;
+            float roundedStep;
+            int maxOut = 2.0f * _omega * (float)count;
+
+            // Copy the first 7 values to the delay buffer for fast computing
+            memcpy(&delay[7], _in->readBuf, 7);
+
+            int i = nextOffset;
+            for (; i < count && outCount < maxOut;) {
+                // Calculate output value
+                // If we still need to use the old values, calculate using delay buf
+                // Otherwise, use normal buffer
+                if (i < 7) {
+                    volk_32f_x2_dot_prod_32f(&outVal, &delay[i], INTERP_TAPS[(int)roundf(_mu * 128.0f)], 8);
+                }
+                else {
+                    volk_32f_x2_dot_prod_32f(&outVal, &_in->readBuf[i - 7], INTERP_TAPS[(int)roundf(_mu * 128.0f)], 8);
+                }
+                out.writeBuf[outCount] = outVal;
+
+
+                // Cursed phase detect approximation (don't ask me how this approximation works)
+                phaseError = (DSP_STEP(lastOutput)*outVal) - (lastOutput*DSP_STEP(outVal));
+                lastOutput = outVal;
+                outCount++;
+
+                // Adjust the symbol rate using the phase error approximation and clamp
+                // TODO: Branchless clamp
+                _dynOmega = _dynOmega + (_gainOmega * phaseError);
+                if (_dynOmega > omegaMax) { _dynOmega = omegaMax; }
+                else if (_dynOmega < omegaMin) { _dynOmega = omegaMin; }
+
+                // Adjust the symbol phase according to the phase error approximation
+                // It will now contain the phase delta needed to jump to the next symbol
+                // Rounded step will contain the rounded number of symbols
+                _mu = _mu + _dynOmega + (_muGain * phaseError);
+                roundedStep = floor(_mu);
+
+                // Step to where the next symbol should be
+                i += (int)roundedStep;
+
+                // Now that we've stepped to the next symbol, keep only the offset inside the symbol
+                _mu -= roundedStep;
+            }
+
+            nextOffset = i - count;
+
+            // Save the last 7 values for the next round
+            memcpy(delay, &_in->readBuf[count - 7], 7);
+            
+            _in->flush();
+            if (!out.swap(outCount)) { return -1; }
+            return count;
+        }
+
+        stream<float> out;
+
+    private:
+        int count;
+
+        // Delay buffer
+        float delay[15];
+        int nextOffset = 0;
+
+        // Configuration
+        float _omega = 1.0f;
+        float _muGain = 1.0f;
+        float _gainOmega = 0.001f;
+        float _omegaRelLimit = 0.005;
+
+        // Precalculated values
+        float omegaMin = _omega + (_omega * _omegaRelLimit);
+        float omegaMax = _omega + (_omega * _omegaRelLimit);
+
+        // Runtime adjusted
+        float _dynOmega = _omega;
+        float _mu = 0.5f;
+        float lastOutput = 0.0f;
+        
+
+        stream<float>* _in;
+
+    };
+}
\ No newline at end of file
diff --git a/core/src/dsp/processing.h b/core/src/dsp/processing.h
index a775ceec..1dccbce4 100644
--- a/core/src/dsp/processing.h
+++ b/core/src/dsp/processing.h
@@ -3,6 +3,7 @@
 #include <volk/volk.h>
 #include <spdlog/spdlog.h>
 #include <string.h>
+#include <stdint.h>
 
 namespace dsp {
     template <class T>
@@ -151,6 +152,54 @@ namespace dsp {
 
     };
 
+    class FeedForwardAGC : public generic_block<FeedForwardAGC> {
+    public:
+        FeedForwardAGC() {}
+
+        FeedForwardAGC(stream<float>* in) { init(in); }
+
+        ~FeedForwardAGC() { generic_block<FeedForwardAGC>::stop(); }
+
+        void init(stream<float>* in) {
+            _in = in;
+            generic_block<FeedForwardAGC>::registerInput(_in);
+            generic_block<FeedForwardAGC>::registerOutput(&out);
+        }
+
+        void setInput(stream<float>* in) {
+            std::lock_guard<std::mutex> lck(generic_block<FeedForwardAGC>::ctrlMtx);
+            generic_block<FeedForwardAGC>::tempStop();
+            generic_block<FeedForwardAGC>::unregisterInput(_in);
+            _in = in;
+            generic_block<FeedForwardAGC>::registerInput(_in);
+            generic_block<FeedForwardAGC>::tempStart();
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+            float level = 0;
+            for (int i = 0; i < count; i++) {
+                if (fabs(_in->readBuf[i]) > level) { level = fabs(_in->readBuf[i]); }
+            }
+
+            volk_32f_s32f_multiply_32f(out.writeBuf, _in->readBuf, 1.0f / level, count);
+
+            _in->flush();
+            if (!out.swap(count)) { return -1; }
+            return count;
+        }
+
+        stream<float> out;
+
+    private:
+        int count;
+        stream<float>* _in;
+
+    };
+
+
     template <class T>
     class Volume : public generic_block<Volume<T>> {
     public:
@@ -365,4 +414,63 @@ namespace dsp {
         stream<T>* _in;
 
     };
+
+    class Threshold : public generic_block<Threshold> {
+    public:
+        Threshold() {}
+
+        Threshold(stream<float>* in) { init(in); }
+
+        ~Threshold() {
+            generic_block<Threshold>::stop();
+            delete[] normBuffer;
+        }
+
+        void init(stream<float>* in) {
+            _in = in;
+            normBuffer = new float[STREAM_BUFFER_SIZE];
+            generic_block<Threshold>::registerInput(_in);
+            generic_block<Threshold>::registerOutput(&out);
+        }
+
+        void setInput(stream<float>* in) {
+            std::lock_guard<std::mutex> lck(generic_block<Threshold>::ctrlMtx);
+            generic_block<Threshold>::tempStop();
+            generic_block<Threshold>::unregisterInput(_in);
+            _in = in;
+            generic_block<Threshold>::registerInput(_in);
+            generic_block<Threshold>::tempStart();
+        }
+
+        void setLevel(float level) {
+            _level = level;
+        }
+
+        float getLevel() {
+            return _level;
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+            for (int i = 0; i < count; i++) {
+                out.writeBuf[i] = (_in->readBuf[i] > 0.0f);
+            }
+
+            _in->flush();
+            if (!out.swap(count)) { return -1; }
+            return count; 
+        }
+
+        stream<uint8_t> out;
+
+
+    private:
+        int count;
+        float* normBuffer;
+        float _level = -50.0f;
+        stream<float>* _in;
+
+    };
 }
\ No newline at end of file
diff --git a/core/src/dsp/routing.h b/core/src/dsp/routing.h
index a1e821b0..42bf4446 100644
--- a/core/src/dsp/routing.h
+++ b/core/src/dsp/routing.h
@@ -151,27 +151,29 @@ namespace dsp {
         }
 
         void bufferWorker() {
-            complex_t* buf = new complex_t[_keep];
+            T* buf = new T[_keep];
             bool delay = _skip < 0;
 
             int readCount = std::min<int>(_keep + _skip, _keep);
             int skip = std::max<int>(_skip, 0);
-            int delaySize = (-_skip) * sizeof(complex_t);
+            int delaySize = (-_skip) * sizeof(T);
             int delayCount = (-_skip);
 
-            complex_t* start = &buf[std::max<int>(-_skip, 0)];
-            complex_t* delayStart = &buf[_keep + _skip];
+            T* start = &buf[std::max<int>(-_skip, 0)];
+            T* delayStart = &buf[_keep + _skip];
 
             while (true) {
                 if (delay) {
                     memmove(buf, delayStart, delaySize);
-                    for (int i = 0; i < delayCount; i++) {
-                        buf[i].i /= 10.0f;
-                        buf[i].q /= 10.0f;
+                    if constexpr (std::is_same_v<T, complex_t> || std::is_same_v<T, stereo_t>) {
+                        for (int i = 0; i < delayCount; i++) {
+                            buf[i].i /= 10.0f;
+                            buf[i].q /= 10.0f;
+                        }
                     }
                 }
                 if (ringBuf.readAndSkip(start, readCount, skip) < 0) { break; };
-                memcpy(out.writeBuf, buf, _keep * sizeof(complex_t));
+                memcpy(out.writeBuf, buf, _keep * sizeof(T));
                 if (!out.swap(_keep)) { break; }
             }
             delete[] buf;
diff --git a/core/src/dsp/source.h b/core/src/dsp/source.h
index b973f7b6..c7ed9da3 100644
--- a/core/src/dsp/source.h
+++ b/core/src/dsp/source.h
@@ -71,4 +71,43 @@ namespace dsp {
         lv_32fc_t* zeroPhase;
 
     };
+
+    template <class T>
+    class HandlerSource : public generic_block<HandlerSource<T>> {
+    public:
+        HandlerSource() {}
+
+        HandlerSource(int (*handler)(T* data, void* ctx), void* ctx) { init(handler, ctx); }
+
+        ~HandlerSource() { generic_block<HandlerSource<T>>::stop(); }
+
+        void init(int (*handler)(T* data, void* ctx), void* ctx) {
+            _handler = handler;
+            _ctx = ctx;
+            generic_block<HandlerSource<T>>::registerOutput(&out);
+        }
+
+        void setHandler(int (*handler)(T* data, void* ctx), void* ctx) {
+            std::lock_guard<std::mutex> lck(generic_block<HandlerSource<T>>::ctrlMtx);
+            generic_block<HandlerSource<T>>::tempStop();
+            _handler = handler;
+            _ctx = ctx;
+            generic_block<HandlerSource<T>>::tempStart();
+        }
+
+        int run() {
+            int count = _handler(out.writeBuf, _ctx);
+            if (count < 0) { return -1; }
+            out.swap(count);
+            return count;
+        }
+
+        stream<T> out;
+
+    private:
+        int count;
+        int (*_handler)(T* data, void* ctx);
+        void* _ctx;
+
+    };
 }
\ No newline at end of file
diff --git a/core/src/gui/widgets/symbol_diagram.cpp b/core/src/gui/widgets/symbol_diagram.cpp
new file mode 100644
index 00000000..4898a4fd
--- /dev/null
+++ b/core/src/gui/widgets/symbol_diagram.cpp
@@ -0,0 +1,42 @@
+#pragma once
+#include <gui/widgets/symbol_diagram.h>
+
+namespace ImGui {
+    SymbolDiagram::SymbolDiagram() {
+        memset(buffer, 0, 1024 * sizeof(float));
+    }
+
+    void SymbolDiagram::draw(const ImVec2& size_arg) {
+        std::lock_guard<std::mutex> lck(bufferMtx);
+        ImGuiWindow* window = GetCurrentWindow();
+        ImGuiStyle& style = GetStyle();
+        float pad = style.FramePadding.y;
+        ImVec2 min = window->DC.CursorPos;
+        ImVec2 size = CalcItemSize(size_arg, CalcItemWidth(), 100);
+        ImRect bb(min, ImVec2(min.x+size.x, min.y+size.y));
+        float lineHeight = size.y;
+
+        ItemSize(size, style.FramePadding.y);
+        if (!ItemAdd(bb, 0)) {
+            return;
+        }
+
+        window->DrawList->AddRectFilled(min, ImVec2(min.x+size.x, min.y+size.y), IM_COL32(0,0,0,255));
+        ImU32 col = ImGui::GetColorU32(ImGuiCol_CheckMark, 0.7f);
+        float increment = size.x / 1024.0f;
+        for (int i = 0; i < 1024; i++) {
+            if (buffer[i] > 1.0f || buffer[i] < -1.0f) { continue; }
+            window->DrawList->AddCircleFilled(ImVec2(((float)i * increment) + min.x, ((buffer[i] + 1) * (size.y*0.5f)) + min.y), 2, col);
+        }
+    }
+
+    float* SymbolDiagram::aquireBuffer() {
+        bufferMtx.lock();
+        return buffer;
+    }
+
+    void SymbolDiagram::releaseBuffer() {
+        bufferMtx.unlock();
+    }
+
+}
\ No newline at end of file
diff --git a/core/src/gui/widgets/symbol_diagram.h b/core/src/gui/widgets/symbol_diagram.h
new file mode 100644
index 00000000..43c9fa38
--- /dev/null
+++ b/core/src/gui/widgets/symbol_diagram.h
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <imgui.h>
+#include <imgui_internal.h>
+#include <dsp/stream.h>
+#include <mutex>
+
+namespace ImGui {
+    class SymbolDiagram {
+    public:
+        SymbolDiagram();
+
+        void draw(const ImVec2& size_arg = ImVec2(0, 0));
+
+        float* aquireBuffer();
+
+        void releaseBuffer();
+
+    private:
+        std::mutex bufferMtx;
+        float buffer[1024];
+
+    };
+}
\ No newline at end of file
diff --git a/core/src/version.h b/core/src/version.h
index 330e64fc..9d1afaf6 100644
--- a/core/src/version.h
+++ b/core/src/version.h
@@ -1,3 +1,3 @@
 #pragma once
 
-#define VERSION_STR "0.2.5_beta"
\ No newline at end of file
+#define VERSION_STR "0.3.0_beta"
\ No newline at end of file
diff --git a/falcon9_decoder/CMakeLists.txt b/falcon9_decoder/CMakeLists.txt
new file mode 100644
index 00000000..90f2cf2d
--- /dev/null
+++ b/falcon9_decoder/CMakeLists.txt
@@ -0,0 +1,20 @@
+cmake_minimum_required(VERSION 3.13)
+project(falcon9_decoder)
+
+if (MSVC)
+    set(CMAKE_CXX_FLAGS "-O2 /std:c++17 /EHsc")
+else()
+    set(CMAKE_CXX_FLAGS "-O3 -std=c++17 -fpermissive")
+endif (MSVC)
+
+file(GLOB_RECURSE SRC "src/*.cpp" "src/*.c")
+
+include_directories("src/")
+include_directories("src/libcorrect/")
+
+add_library(falcon9_decoder SHARED ${SRC})
+target_link_libraries(falcon9_decoder PRIVATE sdrpp_core)
+set_target_properties(falcon9_decoder PROPERTIES PREFIX "")
+
+# Install directives
+install(TARGETS falcon9_decoder DESTINATION lib/sdrpp/plugins)
\ No newline at end of file
diff --git a/falcon9_decoder/src/falcon_fec.h b/falcon9_decoder/src/falcon_fec.h
new file mode 100644
index 00000000..560895d6
--- /dev/null
+++ b/falcon9_decoder/src/falcon_fec.h
@@ -0,0 +1,127 @@
+#pragma once
+#include <dsp/block.h>
+#include <inttypes.h>
+
+// WTF???
+extern "C"
+{
+#include <correct.h>
+}
+
+const uint8_t toDB[] = { 
+0x00, 0x7b, 0xaf, 0xd4, 0x99, 0xe2, 0x36, 0x4d, 0xfa, 0x81, 0x55, 0x2e, 0x63, 0x18, 0xcc, 0xb7, 0x86, 0xfd, 0x29, 0x52, 0x1f,
+                                      0x64, 0xb0, 0xcb, 0x7c, 0x07, 0xd3, 0xa8, 0xe5, 0x9e, 0x4a, 0x31, 0xec, 0x97, 0x43, 0x38, 0x75, 0x0e, 0xda, 0xa1, 0x16, 0x6d, 0xb9, 0xc2, 0x8f, 0xf4,
+                                      0x20, 0x5b, 0x6a, 0x11, 0xc5, 0xbe, 0xf3, 0x88, 0x5c, 0x27, 0x90, 0xeb, 0x3f, 0x44, 0x09, 0x72, 0xa6, 0xdd, 0xef, 0x94, 0x40, 0x3b, 0x76, 0x0d, 0xd9,
+                                      0xa2, 0x15, 0x6e, 0xba, 0xc1, 0x8c, 0xf7, 0x23, 0x58, 0x69, 0x12, 0xc6, 0xbd, 0xf0, 0x8b, 0x5f, 0x24, 0x93, 0xe8, 0x3c, 0x47, 0x0a, 0x71, 0xa5, 0xde,
+                                      0x03, 0x78, 0xac, 0xd7, 0x9a, 0xe1, 0x35, 0x4e, 0xf9, 0x82, 0x56, 0x2d, 0x60, 0x1b, 0xcf, 0xb4, 0x85, 0xfe, 0x2a, 0x51, 0x1c, 0x67, 0xb3, 0xc8, 0x7f,
+                                      0x04, 0xd0, 0xab, 0xe6, 0x9d, 0x49, 0x32, 0x8d, 0xf6, 0x22, 0x59, 0x14, 0x6f, 0xbb, 0xc0, 0x77, 0x0c, 0xd8, 0xa3, 0xee, 0x95, 0x41, 0x3a, 0x0b, 0x70,
+                                      0xa4, 0xdf, 0x92, 0xe9, 0x3d, 0x46, 0xf1, 0x8a, 0x5e, 0x25, 0x68, 0x13, 0xc7, 0xbc, 0x61, 0x1a, 0xce, 0xb5, 0xf8, 0x83, 0x57, 0x2c, 0x9b, 0xe0, 0x34,
+                                      0x4f, 0x02, 0x79, 0xad, 0xd6, 0xe7, 0x9c, 0x48, 0x33, 0x7e, 0x05, 0xd1, 0xaa, 0x1d, 0x66, 0xb2, 0xc9, 0x84, 0xff, 0x2b, 0x50, 0x62, 0x19, 0xcd, 0xb6,
+                                      0xfb, 0x80, 0x54, 0x2f, 0x98, 0xe3, 0x37, 0x4c, 0x01, 0x7a, 0xae, 0xd5, 0xe4, 0x9f, 0x4b, 0x30, 0x7d, 0x06, 0xd2, 0xa9, 0x1e, 0x65, 0xb1, 0xca, 0x87,
+                                      0xfc, 0x28, 0x53, 0x8e, 0xf5, 0x21, 0x5a, 0x17, 0x6c, 0xb8, 0xc3, 0x74, 0x0f, 0xdb, 0xa0, 0xed, 0x96, 0x42, 0x39, 0x08, 0x73, 0xa7, 0xdc, 0x91, 0xea,
+                                      0x3e, 0x45, 0xf2, 0x89, 0x5d, 0x26, 0x6b, 0x10, 0xc4, 0xbf
+};
+
+const uint8_t fromDB[] = {
+    0x00, 0xcc, 0xac, 0x60, 0x79, 0xb5, 0xd5, 0x19, 0xf0, 0x3c, 0x5c, 0x90, 0x89, 0x45, 0x25, 0xe9, 0xfd, 0x31, 0x51, 0x9d,
+                                        0x84, 0x48, 0x28, 0xe4, 0x0d, 0xc1, 0xa1, 0x6d, 0x74, 0xb8, 0xd8, 0x14, 0x2e, 0xe2, 0x82, 0x4e, 0x57, 0x9b, 0xfb, 0x37, 0xde, 0x12, 0x72, 0xbe, 0xa7,
+                                        0x6b, 0x0b, 0xc7, 0xd3, 0x1f, 0x7f, 0xb3, 0xaa, 0x66, 0x06, 0xca, 0x23, 0xef, 0x8f, 0x43, 0x5a, 0x96, 0xf6, 0x3a, 0x42, 0x8e, 0xee, 0x22, 0x3b, 0xf7,
+                                        0x97, 0x5b, 0xb2, 0x7e, 0x1e, 0xd2, 0xcb, 0x07, 0x67, 0xab, 0xbf, 0x73, 0x13, 0xdf, 0xc6, 0x0a, 0x6a, 0xa6, 0x4f, 0x83, 0xe3, 0x2f, 0x36, 0xfa, 0x9a,
+                                        0x56, 0x6c, 0xa0, 0xc0, 0x0c, 0x15, 0xd9, 0xb9, 0x75, 0x9c, 0x50, 0x30, 0xfc, 0xe5, 0x29, 0x49, 0x85, 0x91, 0x5d, 0x3d, 0xf1, 0xe8, 0x24, 0x44, 0x88,
+                                        0x61, 0xad, 0xcd, 0x01, 0x18, 0xd4, 0xb4, 0x78, 0xc5, 0x09, 0x69, 0xa5, 0xbc, 0x70, 0x10, 0xdc, 0x35, 0xf9, 0x99, 0x55, 0x4c, 0x80, 0xe0, 0x2c, 0x38,
+                                        0xf4, 0x94, 0x58, 0x41, 0x8d, 0xed, 0x21, 0xc8, 0x04, 0x64, 0xa8, 0xb1, 0x7d, 0x1d, 0xd1, 0xeb, 0x27, 0x47, 0x8b, 0x92, 0x5e, 0x3e, 0xf2, 0x1b, 0xd7,
+                                        0xb7, 0x7b, 0x62, 0xae, 0xce, 0x02, 0x16, 0xda, 0xba, 0x76, 0x6f, 0xa3, 0xc3, 0x0f, 0xe6, 0x2a, 0x4a, 0x86, 0x9f, 0x53, 0x33, 0xff, 0x87, 0x4b, 0x2b,
+                                        0xe7, 0xfe, 0x32, 0x52, 0x9e, 0x77, 0xbb, 0xdb, 0x17, 0x0e, 0xc2, 0xa2, 0x6e, 0x7a, 0xb6, 0xd6, 0x1a, 0x03, 0xcf, 0xaf, 0x63, 0x8a, 0x46, 0x26, 0xea,
+                                        0xf3, 0x3f, 0x5f, 0x93, 0xa9, 0x65, 0x05, 0xc9, 0xd0, 0x1c, 0x7c, 0xb0, 0x59, 0x95, 0xf5, 0x39, 0x20, 0xec, 0x8c, 0x40, 0x54, 0x98, 0xf8, 0x34, 0x2d,
+                                        0xe1, 0x81, 0x4d, 0xa4, 0x68, 0x08, 0xc4, 0xdd, 0x11, 0x71, 0xbd
+};
+
+const uint8_t randVals[] = {
+    0xFF, 0x48, 0x0E, 0xC0, 0x9A, 0x0D, 0x70, 0xBC, 0x8E, 0x2C, 0x93, 0xAD, 0xA7, 0xB7, 0x46, 0xCE,
+    0x5A, 0x97, 0x7D, 0xCC, 0x32, 0xA2, 0xBF, 0x3E, 0x0A, 0x10, 0xF1, 0x88, 0x94, 0xCD, 0xEA, 0xB1,
+    0xFE, 0x90, 0x1D, 0x81, 0x34, 0x1A, 0xE1, 0x79, 0x1C, 0x59, 0x27, 0x5B, 0x4F, 0x6E, 0x8D, 0x9C,
+    0xB5, 0x2E, 0xFB, 0x98, 0x65, 0x45, 0x7E, 0x7C, 0x14, 0x21, 0xE3, 0x11, 0x29, 0x9B, 0xD5, 0x63,
+    0xFD, 0x20, 0x3B, 0x02, 0x68, 0x35, 0xC2, 0xF2, 0x38, 0xB2, 0x4E, 0xB6, 0x9E, 0xDD, 0x1B, 0x39,
+    0x6A, 0x5D, 0xF7, 0x30, 0xCA, 0x8A, 0xFC, 0xF8, 0x28, 0x43, 0xC6, 0x22, 0x53, 0x37, 0xAA, 0xC7,
+    0xFA, 0x40, 0x76, 0x04, 0xD0, 0x6B, 0x85, 0xE4, 0x71, 0x64, 0x9D, 0x6D, 0x3D, 0xBA, 0x36, 0x72,
+    0xD4, 0xBB, 0xEE, 0x61, 0x95, 0x15, 0xF9, 0xF0, 0x50, 0x87, 0x8C, 0x44, 0xA6, 0x6F, 0x55, 0x8F,
+    0xF4, 0x80, 0xEC, 0x09, 0xA0, 0xD7, 0x0B, 0xC8, 0xE2, 0xC9, 0x3A, 0xDA, 0x7B, 0x74, 0x6C, 0xE5,
+    0xA9, 0x77, 0xDC, 0xC3, 0x2A, 0x2B, 0xF3, 0xE0, 0xA1, 0x0F, 0x18, 0x89, 0x4C, 0xDE, 0xAB, 0x1F,
+    0xE9, 0x01, 0xD8, 0x13, 0x41, 0xAE, 0x17, 0x91, 0xC5, 0x92, 0x75, 0xB4, 0xF6, 0xE8, 0xD9, 0xCB,
+    0x52, 0xEF, 0xB9, 0x86, 0x54, 0x57, 0xE7, 0xC1, 0x42, 0x1E, 0x31, 0x12, 0x99, 0xBD, 0x56, 0x3F,
+    0xD2, 0x03, 0xB0, 0x26, 0x83, 0x5C, 0x2F, 0x23, 0x8B, 0x24, 0xEB, 0x69, 0xED, 0xD1, 0xB3, 0x96,
+    0xA5, 0xDF, 0x73, 0x0C, 0xA8, 0xAF, 0xCF, 0x82, 0x84, 0x3C, 0x62, 0x25, 0x33, 0x7A, 0xAC, 0x7F,
+    0xA4, 0x07, 0x60, 0x4D, 0x06, 0xB8, 0x5E, 0x47, 0x16, 0x49, 0xD6, 0xD3, 0xDB, 0xA3, 0x67, 0x2D,
+    0x4B, 0xBE, 0xE6, 0x19, 0x51, 0x5F, 0x9F, 0x05, 0x08, 0x78, 0xC4, 0x4A, 0x66, 0xF5, 0x58
+};
+
+namespace dsp {
+    class FalconRS : public generic_block<FalconRS> {
+    public:
+        FalconRS() {}
+
+        FalconRS(stream<uint8_t>* in) { init(in); }
+
+        ~FalconRS() {
+            generic_block<FalconRS>::stop();
+        }
+
+        void init(stream<uint8_t>* in) {
+            _in = in;
+
+            for (int i = 0; i < 5; i++) { memset(buffers[i], 0, 255); }
+            for (int i = 0; i < 5; i++) { memset(outBuffers[i], 0, 255); }
+            rs = correct_reed_solomon_create(correct_rs_primitive_polynomial_ccsds, 120, 11, 16);
+            if (rs == NULL) { printf("Error creating the reed solomon decoder\n"); }
+            
+            generic_block<FalconRS>::registerInput(_in);
+            generic_block<FalconRS>::registerOutput(&out);
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+            uint8_t* data = _in->readBuf + 4;
+
+            // Deinterleave
+            for (int i = 0; i < 255*5; i++) {
+                buffers[i%5][i/5] = fromDB[data[i]];
+            }
+
+            // Reed the solomon :weary:
+            int result = 0;
+            result = correct_reed_solomon_decode(rs, buffers[0], 255, outBuffers[0]);
+            if (result == -1) { _in->flush(); return count; }
+            result = correct_reed_solomon_decode(rs, buffers[1], 255, outBuffers[1]);
+            if (result == -1) { _in->flush(); return count; }
+            result = correct_reed_solomon_decode(rs, buffers[2], 255, outBuffers[2]);
+            if (result == -1) { _in->flush(); return count; }
+            result = correct_reed_solomon_decode(rs, buffers[3], 255, outBuffers[3]);
+            if (result == -1) { _in->flush(); return count; }
+            result = correct_reed_solomon_decode(rs, buffers[4], 255, outBuffers[4]);
+            if (result == -1) { _in->flush(); return count; }
+
+            // Reinterleave
+            for (int i = 0; i < 255*5; i++) {
+                out.writeBuf[i] = toDB[outBuffers[i%5][i/5]] ^ randVals[i % 255];
+            }
+
+            out.swap(255*5);
+
+            _in->flush();
+            return count;
+        }
+
+        stream<uint8_t> out;
+
+    private:
+        int count;
+        uint8_t buffers[5][255];
+        uint8_t outBuffers[5][255];
+        correct_reed_solomon* rs;
+        
+        stream<uint8_t>* _in;
+
+    };
+}
\ No newline at end of file
diff --git a/falcon9_decoder/src/falcon_packet.h b/falcon9_decoder/src/falcon_packet.h
new file mode 100644
index 00000000..e1d7aca9
--- /dev/null
+++ b/falcon9_decoder/src/falcon_packet.h
@@ -0,0 +1,121 @@
+#pragma once
+#include <dsp/block.h>
+#include <inttypes.h>
+
+namespace dsp {
+    struct FalconFrameHeader {
+        uint32_t counter;
+        uint16_t packet;
+    };
+
+    class FalconPacketSync : public generic_block<FalconPacketSync> {
+    public:
+        FalconPacketSync() {}
+
+        FalconPacketSync(stream<uint8_t>* in) { init(in); }
+
+        ~FalconPacketSync() {
+            generic_block<FalconPacketSync>::stop();
+        }
+
+        void init(stream<uint8_t>* in) {
+            _in = in;
+            
+            generic_block<FalconPacketSync>::registerInput(_in);
+            generic_block<FalconPacketSync>::registerOutput(&out);
+        }
+
+        int run() {
+            count = _in->read();
+            if (count < 0) { return -1; }
+
+            // Parse frame header
+            FalconFrameHeader header;
+            header.packet = (_in->readBuf[3] | ((_in->readBuf[2] & 0b111) << 8));
+            header.counter = ((_in->readBuf[2] >> 3) | (_in->readBuf[1] << 5) | ((_in->readBuf[0] & 0b111111) << 13));
+
+            // Pointer to the data aera of the frame
+            uint8_t* data = _in->readBuf + 4;
+            int dataLen = 1191;
+
+            // If a frame was missed, cancel reading the current packet
+            if (lastCounter + 1 != header.counter) {
+                packetRead = -1;
+            }
+            lastCounter = header.counter;
+
+            // If frame is just a continuation of a single packet, save it
+            // If we're not currently reading a packet
+            if (header.packet == 2047 && packetRead >= 0) {
+                memcpy(packet + packetRead, data, dataLen);
+                packetRead += dataLen;
+                _in->flush();
+                printf("Wow, all data\n");
+                return count;
+            }
+            else if (header.packet == 2047) {
+                printf("Wow, all data\n");
+                _in->flush();
+                return count; 
+            }
+
+            // Finish reading the last package and send it
+            if (packetRead >= 0) {
+                memcpy(packet + packetRead, data, header.packet);
+                memcpy(out.writeBuf, packet, packetRead + header.packet);
+                out.swap(packetRead + header.packet);
+                packetRead = -1;
+            }
+
+            // Iterate through every packet of the frame
+            for (int i = header.packet; i < dataLen;) {
+                // First, check if we can read the header. If not, save and wait for next frame
+                if (dataLen - i < 4) {
+                    packetRead = dataLen - i;
+                    memcpy(packet, &data[i], packetRead);
+                    break;
+                }
+
+                // Extract packet length
+                uint16_t length = (((data[i] & 0b1111) << 8) | data[i + 1]) + 2;
+
+                // Check if it's not an invalid zero length packet
+                if (length <= 2) {
+                    packetRead = -1;
+                    break;
+                }
+                
+                uint64_t pktId =    ((uint64_t)data[i + 2] << 56) | ((uint64_t)data[i + 3] << 48) | ((uint64_t)data[i + 4] << 40) | ((uint64_t)data[i + 5] << 32)
+                                |   ((uint64_t)data[i + 6] << 24) | ((uint64_t)data[i + 7] << 16) | ((uint64_t)data[i + 8] << 8) | data[i + 9];
+
+                // If the packet doesn't fit the frame, save and go to next frame
+                if (dataLen - i < length) {
+                    packetRead = dataLen - i;
+                    memcpy(packet, &data[i], packetRead);
+                    break;
+                }
+
+                // Here, the package fits fully, read it and jump to the next
+                memcpy(out.writeBuf, &data[i], length);
+                out.swap(length);
+                i += length;
+
+            }
+
+            _in->flush();
+            return count;
+        }
+
+        stream<uint8_t> out;
+
+    private:
+        int count;
+        uint32_t lastCounter = 0;
+
+        int packetRead = -1;
+        uint8_t packet[0x4008];
+        
+        stream<uint8_t>* _in;
+
+    };
+}
\ No newline at end of file
diff --git a/falcon9_decoder/src/libcorrect/convolutional/CMakeLists.txt b/falcon9_decoder/src/libcorrect/convolutional/CMakeLists.txt
new file mode 100644
index 00000000..43c4bd9d
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(SRCFILES bit.c metric.c history_buffer.c error_buffer.c lookup.c convolutional.c encode.c decode.c)
+add_library(correct-convolutional OBJECT ${SRCFILES})
+if(HAVE_SSE)
+    add_subdirectory(sse)
+endif()
diff --git a/falcon9_decoder/src/libcorrect/convolutional/bit.c b/falcon9_decoder/src/libcorrect/convolutional/bit.c
new file mode 100644
index 00000000..9ac4e1ef
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/bit.c
@@ -0,0 +1,232 @@
+#include "correct/convolutional/bit.h"
+
+bit_writer_t *bit_writer_create(uint8_t *bytes, size_t len) {
+    bit_writer_t *w = calloc(1, sizeof(bit_writer_t));
+
+    if (bytes) {
+        bit_writer_reconfigure(w, bytes, len);
+    }
+
+    return w;
+}
+
+void bit_writer_reconfigure(bit_writer_t *w, uint8_t *bytes, size_t len) {
+    w->bytes = bytes;
+    w->len = len;
+
+    w->current_byte = 0;
+    w->current_byte_len = 0;
+    w->byte_index = 0;
+}
+
+void bit_writer_destroy(bit_writer_t *w) {
+    free(w);
+}
+
+void bit_writer_write(bit_writer_t *w, uint8_t val, unsigned int n) {
+    for (size_t j = 0; j < n; j++) {
+        bit_writer_write_1(w, val);
+        val >>= 1;
+    }
+}
+
+void bit_writer_write_1(bit_writer_t *w, uint8_t val) {
+    w->current_byte |= val & 1;
+    w->current_byte_len++;
+
+    if (w->current_byte_len == 8) {
+        // 8 bits in a byte -- move to the next byte
+        w->bytes[w->byte_index] = w->current_byte;
+        w->byte_index++;
+        w->current_byte_len = 0;
+        w->current_byte = 0;
+    } else {
+        w->current_byte <<= 1;
+    }
+}
+
+void bit_writer_write_bitlist(bit_writer_t *w, uint8_t *l, size_t len) {
+    // first close the current byte
+    // we might have been given too few elements to do that. be careful.
+    size_t close_len = 8 - w->current_byte_len;
+    close_len = (close_len < len) ? close_len : len;
+
+    uint16_t b = w->current_byte;
+
+    for (ptrdiff_t i = 0; i < close_len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+
+    l += close_len;
+    len -= close_len;
+
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+
+    if (w->current_byte_len + close_len == 8) {
+        b >>= 1;
+        bytes[byte_index] = b;
+        byte_index++;
+    } else {
+        w->current_byte = b;
+        w->current_byte_len += close_len;
+        return;
+    }
+
+    size_t full_bytes = len/8;
+
+    for (size_t i = 0; i < full_bytes; i++) {
+        bytes[byte_index] = l[0] << 7 | l[1] << 6 | l[2] << 5 |
+                            l[3] << 4 | l[4] << 3 | l[5] << 2 |
+                            l[6] << 1 | l[7];
+        byte_index += 1;
+        l += 8;
+    }
+
+    len -= 8*full_bytes;
+
+    b = 0;
+    for (ptrdiff_t i = 0; i < len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+    w->current_byte = b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len) {
+    l = l + len - 1;
+
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+    uint16_t b;
+
+    if (w->current_byte_len != 0) {
+        size_t close_len = 8 - w->current_byte_len;
+        close_len = (close_len < len) ? close_len : len;
+
+        b = w->current_byte;
+
+        for (ptrdiff_t i = 0; i < close_len; i++) {
+            b |= *l;
+            b <<= 1;
+            l--;
+        }
+
+        len -= close_len;
+
+        if (w->current_byte_len + close_len == 8) {
+            b >>= 1;
+            bytes[byte_index] = b;
+            byte_index++;
+        } else {
+            w->current_byte = b;
+            w->current_byte_len += close_len;
+            return;
+        }
+    }
+
+    size_t full_bytes = len/8;
+
+    for (size_t i = 0; i < full_bytes; i++) {
+        bytes[byte_index] = l[0] << 7 | l[-1] << 6 | l[-2] << 5 |
+                            l[-3] << 4 | l[-4] << 3 | l[-5] << 2 |
+                            l[-6] << 1 | l[-7];
+        byte_index += 1;
+        l -= 8;
+    }
+
+    len -= 8*full_bytes;
+
+    b = 0;
+    for (ptrdiff_t i = 0; i < len; i++) {
+        b |= *l;
+        b <<= 1;
+        l--;
+    }
+
+    w->current_byte = (uint8_t)b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+void bit_writer_flush_byte(bit_writer_t *w) {
+    if (w->current_byte_len != 0) {
+        w->current_byte <<= (8 - w->current_byte_len);
+        w->bytes[w->byte_index] = w->current_byte;
+        w->byte_index++;
+        w->current_byte_len = 0;
+    }
+}
+
+size_t bit_writer_length(bit_writer_t *w) {
+    return w->byte_index;
+}
+
+uint8_t reverse_byte(uint8_t b) {
+    return (b & 0x80) >> 7 | (b & 0x40) >> 5 | (b & 0x20) >> 3 |
+           (b & 0x10) >> 1 | (b & 0x08) << 1 | (b & 0x04) << 3 |
+           (b & 0x02) << 5 | (b & 0x01) << 7;
+}
+
+static uint8_t reverse_table[256];
+
+void create_reverse_table() {
+    for (uint16_t i = 0; i < 256; i++) {
+        reverse_table[i] = reverse_byte(i);
+    }
+}
+
+bit_reader_t *bit_reader_create(const uint8_t *bytes, size_t len) {
+    bit_reader_t *r = calloc(1, sizeof(bit_reader_t));
+
+    static bool reverse_table_created = false;
+
+    if (!reverse_table_created) {
+        create_reverse_table();
+        reverse_table_created = true;
+    }
+
+    if (bytes) {
+        bit_reader_reconfigure(r, bytes, len);
+    }
+
+    return r;
+}
+
+void bit_reader_reconfigure(bit_reader_t *r, const uint8_t *bytes, size_t len) {
+    r->bytes = bytes;
+    r->len = len;
+
+    r->current_byte_len = 8;
+    r->current_byte = bytes[0];
+    r->byte_index = 0;
+}
+
+void bit_reader_destroy(bit_reader_t *r) {
+    free(r);
+}
+
+uint8_t bit_reader_read(bit_reader_t *r, unsigned int n) {
+    unsigned int read = 0;
+    unsigned int n_copy = n;
+
+    if (r->current_byte_len < n) {
+        read = r->current_byte & ((1 << r->current_byte_len) - 1);
+        r->byte_index++;
+        r->current_byte = r->bytes[r->byte_index];
+        n -= r->current_byte_len;
+        r->current_byte_len = 8;
+        read <<= n;
+    }
+
+    uint8_t copy_mask = (1 << n) - 1;
+    copy_mask <<= (r->current_byte_len - n);
+    read |= (r->current_byte & copy_mask) >> (r->current_byte_len - n);
+    r->current_byte_len -= n;
+    return reverse_table[read] >> (8 - n_copy);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/convolutional.c b/falcon9_decoder/src/libcorrect/convolutional/convolutional.c
new file mode 100644
index 00000000..910ed156
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/convolutional.c
@@ -0,0 +1,59 @@
+#include "correct/convolutional/convolutional.h"
+
+// https://www.youtube.com/watch?v=b3_lVSrPB6w
+
+correct_convolutional *_correct_convolutional_init(correct_convolutional *conv,
+                                                   size_t rate, size_t order,
+                                                   const polynomial_t *poly) {
+    if (order > 8 * sizeof(shift_register_t)) {
+        // XXX turn this into an error code
+        // printf("order must be smaller than 8 * sizeof(shift_register_t)\n");
+        return NULL;
+    }
+    if (rate < 2) {
+        // XXX turn this into an error code
+        // printf("rate must be 2 or greater\n");
+        return NULL;
+    }
+
+    conv->order = order;
+    conv->rate = rate;
+    conv->numstates = 1 << order;
+
+    unsigned int *table = malloc(sizeof(unsigned int) * (1 << order));
+    fill_table(conv->rate, conv->order, poly, table);
+    *(unsigned int **)&conv->table = table;
+
+    conv->bit_writer = bit_writer_create(NULL, 0);
+    conv->bit_reader = bit_reader_create(NULL, 0);
+
+    conv->has_init_decode = false;
+    return conv;
+}
+
+correct_convolutional *correct_convolutional_create(size_t rate, size_t order,
+                                                    const polynomial_t *poly) {
+    correct_convolutional *conv = malloc(sizeof(correct_convolutional));
+    correct_convolutional *init_conv = _correct_convolutional_init(conv, rate, order, poly);
+    if (!init_conv) {
+        free(conv);
+    }
+    return init_conv;
+}
+
+void _correct_convolutional_teardown(correct_convolutional *conv) {
+    free(*(unsigned int **)&conv->table);
+    bit_writer_destroy(conv->bit_writer);
+    bit_reader_destroy(conv->bit_reader);
+    if (conv->has_init_decode) {
+        pair_lookup_destroy(conv->pair_lookup);
+        history_buffer_destroy(conv->history_buffer);
+        error_buffer_destroy(conv->errors);
+        free(conv->distances);
+    }
+}
+
+void correct_convolutional_destroy(correct_convolutional *conv) {
+    _correct_convolutional_teardown(conv);
+    free(conv);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/decode.c b/falcon9_decoder/src/libcorrect/convolutional/decode.c
new file mode 100644
index 00000000..e8891e04
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/decode.c
@@ -0,0 +1,321 @@
+#include "correct/convolutional/convolutional.h"
+
+void conv_decode_print_iter(correct_convolutional *conv, unsigned int iter,
+                            unsigned int winner_index) {
+    if (iter < 2220) {
+        return;
+    }
+    printf("iteration: %d\n", iter);
+    distance_t *errors = conv->errors->write_errors;
+    printf("errors:\n");
+    for (shift_register_t i = 0; i < conv->numstates / 2; i++) {
+        printf("%2d: %d\n", i, errors[i]);
+    }
+    printf("\n");
+    printf("history:\n");
+    for (shift_register_t i = 0; i < conv->numstates / 2; i++) {
+        printf("%2d: ", i);
+        for (unsigned int j = 0; j <= winner_index; j++) {
+            printf("%d", conv->history_buffer->history[j][i] ? 1 : 0);
+        }
+        printf("\n");
+    }
+    printf("\n");
+}
+
+void convolutional_decode_warmup(correct_convolutional *conv, unsigned int sets,
+                                 const uint8_t *soft) {
+    // first phase: load shiftregister up from 0 (order goes from 1 to conv->order)
+    // we are building up error metrics for the first order bits
+    for (unsigned int i = 0; i < conv->order - 1 && i < sets; i++) {
+        // peel off rate bits from encoded to recover the same `out` as in the encoding process
+        // the difference being that this `out` will have the channel noise/errors applied
+        unsigned int out;
+        if (!soft) {
+            out = bit_reader_read(conv->bit_reader, conv->rate);
+        }
+        const distance_t *read_errors = conv->errors->read_errors;
+        distance_t *write_errors = conv->errors->write_errors;
+        // walk all of the state we have so far
+        for (size_t j = 0; j < (1 << (i + 1)); j += 1) {
+            unsigned int last = j >> 1;
+            distance_t dist;
+            if (soft) {
+                if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                    dist = metric_soft_distance_linear(conv->table[j], soft + i * conv->rate,
+                                                       conv->rate);
+                } else {
+                    dist = metric_soft_distance_quadratic(conv->table[j], soft + i * conv->rate,
+                                                          conv->rate);
+                }
+            } else {
+                dist = metric_distance(conv->table[j], out);
+            }
+            write_errors[j] = dist + read_errors[last];
+        }
+        error_buffer_swap(conv->errors);
+    }
+}
+
+void convolutional_decode_inner(correct_convolutional *conv, unsigned int sets,
+                                const uint8_t *soft) {
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
+        distance_t *distances = conv->distances;
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        pair_lookup_t pair_lookup = conv->pair_lookup;
+        pair_lookup_fill_distance(pair_lookup, distances);
+
+        // a mask to get the high order bit from the shift register
+        unsigned int num_iter = highbit << 1;
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit errors at that path at
+        // this time slice
+        // this loop considers two paths per iteration (high order bit set, clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the least aggregated bit
+        // errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share a successor
+        //
+        // the first set definition is the two states that are the same except for the least order
+        // bit
+        // these two share a predecessor because their high n - 1 bits are the same (differ only by
+        // newest bit)
+        //
+        // the second set definition is the two states that are the same except for the high order
+        // bit
+        // these two share a successor because the oldest high order bit will be shifted out, and
+        // the other bits will be present in the successor
+        //
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += 8, high += 8, base += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 4;
+                 offset += 2, base_offset += 1) {
+                distance_pair_key_t low_key = pair_lookup.keys[base + base_offset];
+                distance_pair_key_t high_key = pair_lookup.keys[highbase + base + base_offset];
+                distance_pair_t low_concat_dist = pair_lookup.distances[low_key];
+                distance_pair_t high_concat_dist = pair_lookup.distances[high_key];
+
+                distance_t low_past_error = read_errors[base + base_offset];
+                distance_t high_past_error = read_errors[highbase + base + base_offset];
+
+                distance_t low_error = (low_concat_dist & 0xffff) + low_past_error;
+                distance_t high_error = (high_concat_dist & 0xffff) + high_past_error;
+
+                shift_register_t successor = low + offset;
+                distance_t error;
+                uint8_t history_mask;
+                if (low_error <= high_error) {
+                    error = low_error;
+                    history_mask = 0;
+                } else {
+                    error = high_error;
+                    history_mask = 1;
+                }
+                write_errors[successor] = error;
+                history[successor] = history_mask;
+
+                shift_register_t low_plus_one = low + offset + 1;
+
+                distance_t low_plus_one_error = (low_concat_dist >> 16) + low_past_error;
+                distance_t high_plus_one_error = (high_concat_dist >> 16) + high_past_error;
+
+                shift_register_t plus_one_successor = low_plus_one;
+                distance_t plus_one_error;
+                uint8_t plus_one_history_mask;
+                if (low_plus_one_error <= high_plus_one_error) {
+                    plus_one_error = low_plus_one_error;
+                    plus_one_history_mask = 0;
+                } else {
+                    plus_one_error = high_plus_one_error;
+                    plus_one_history_mask = 1;
+                }
+                write_errors[plus_one_successor] = plus_one_error;
+                history[plus_one_successor] = plus_one_history_mask;
+            }
+        }
+
+        history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+void convolutional_decode_tail(correct_convolutional *conv, unsigned int sets,
+                               const uint8_t *soft) {
+    // flush state registers
+    // now we only shift in 0s, skipping 1-successors
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = sets - conv->order + 1; i < sets; i++) {
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+
+        // calculate the distance from all output states to our sliced bits
+        distance_t *distances = conv->distances;
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        const unsigned int *table = conv->table;
+
+        // a mask to get the high order bit from the shift register
+        unsigned int num_iter = highbit << 1;
+        unsigned int skip = 1 << (conv->order - (sets - i));
+        unsigned int base_skip = skip >> 1;
+
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += skip, high += skip, base += base_skip) {
+            unsigned int low_output = table[low];
+            unsigned int high_output = table[high];
+            distance_t low_dist = distances[low_output];
+            distance_t high_dist = distances[high_output];
+
+            distance_t low_past_error = read_errors[base];
+            distance_t high_past_error = read_errors[highbase + base];
+
+            distance_t low_error = low_dist + low_past_error;
+            distance_t high_error = high_dist + high_past_error;
+
+            shift_register_t successor = low;
+            distance_t error;
+            uint8_t history_mask;
+            if (low_error < high_error) {
+                error = low_error;
+                history_mask = 0;
+            } else {
+                error = high_error;
+                history_mask = 1;
+            }
+            write_errors[successor] = error;
+            history[successor] = history_mask;
+        }
+
+        history_buffer_process_skip(conv->history_buffer, write_errors, conv->bit_writer, skip);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+void _convolutional_decode_init(correct_convolutional *conv, unsigned int min_traceback,
+                                unsigned int traceback_length, unsigned int renormalize_interval) {
+    conv->has_init_decode = true;
+
+    conv->distances = calloc(1 << (conv->rate), sizeof(distance_t));
+    conv->pair_lookup = pair_lookup_create(conv->rate, conv->order, conv->table);
+
+    conv->soft_measurement = CORRECT_SOFT_LINEAR;
+
+    // we limit history to go back as far as 5 * the order of our polynomial
+    conv->history_buffer = history_buffer_create(min_traceback, traceback_length, renormalize_interval,
+                                                 conv->numstates / 2, 1 << (conv->order - 1));
+
+    conv->errors = error_buffer_create(conv->numstates);
+}
+
+static ssize_t _convolutional_decode(correct_convolutional *conv, size_t num_encoded_bits,
+                                     size_t num_encoded_bytes, uint8_t *msg,
+                                     const soft_t *soft_encoded) {
+    if (!conv->has_init_decode) {
+        uint64_t max_error_per_input = conv->rate * soft_max;
+        unsigned int renormalize_interval = distance_max / max_error_per_input;
+        _convolutional_decode_init(conv, 5 * conv->order, 15 * conv->order, renormalize_interval);
+    }
+
+    size_t sets = num_encoded_bits / conv->rate;
+    // XXX fix this vvvvvv
+    size_t decoded_len_bytes = num_encoded_bytes;
+    bit_writer_reconfigure(conv->bit_writer, msg, decoded_len_bytes);
+
+    error_buffer_reset(conv->errors);
+    history_buffer_reset(conv->history_buffer);
+
+    // no outputs are generated during warmup
+    convolutional_decode_warmup(conv, sets, soft_encoded);
+    convolutional_decode_inner(conv, sets, soft_encoded);
+    convolutional_decode_tail(conv, sets, soft_encoded);
+
+    history_buffer_flush(conv->history_buffer, conv->bit_writer);
+
+    return bit_writer_length(conv->bit_writer);
+}
+
+// perform viterbi decoding
+// hard decoder
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->rate) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    size_t num_encoded_bytes =
+        (num_encoded_bits % 8) ? (num_encoded_bits / 8 + 1) : (num_encoded_bits / 8);
+    bit_reader_reconfigure(conv->bit_reader, encoded, num_encoded_bytes);
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv, const soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->rate) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    size_t num_encoded_bytes =
+        (num_encoded_bits % 8) ? (num_encoded_bits / 8 + 1) : (num_encoded_bits / 8);
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/encode.c b/falcon9_decoder/src/libcorrect/convolutional/encode.c
new file mode 100644
index 00000000..5041262b
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/encode.c
@@ -0,0 +1,61 @@
+#include "correct/convolutional/convolutional.h"
+
+size_t correct_convolutional_encode_len(correct_convolutional *conv, size_t msg_len) {
+    size_t msgbits = 8 * msg_len;
+    size_t encodedbits = conv->rate * (msgbits + conv->order + 1);
+    return encodedbits;
+}
+
+// shift in most significant bit every time, one byte at a time
+// shift register takes most recent bit on right, shifts left
+// poly is written in same order, just & mask message w/ poly
+
+// assume that encoded length is long enough?
+size_t correct_convolutional_encode(correct_convolutional *conv,
+                                    const uint8_t *msg,
+                                    size_t msg_len,
+                                    uint8_t *encoded) {
+    // convolutional code convolves filter coefficients, given by
+    //     the polynomial, with some history from our message.
+    //     the history is stored as single subsequent bits in shiftregister
+    shift_register_t shiftregister = 0;
+
+    // shiftmask is the shiftregister bit mask that removes bits
+    //      that extend beyond order
+    // e.g. if order is 7, then remove the 8th bit and beyond
+    unsigned int shiftmask = (1 << conv->order) - 1;
+
+    size_t encoded_len_bits = correct_convolutional_encode_len(conv, msg_len);
+    size_t encoded_len = (encoded_len_bits % 8) ? (encoded_len_bits / 8 + 1) : (encoded_len_bits / 8);
+    bit_writer_reconfigure(conv->bit_writer, encoded, encoded_len);
+
+    bit_reader_reconfigure(conv->bit_reader, msg, msg_len);
+
+    for (size_t i = 0; i < 8 * msg_len; i++) {
+        // shiftregister has oldest bits on left, newest on right
+        shiftregister <<= 1;
+        shiftregister |= bit_reader_read(conv->bit_reader, 1);
+        shiftregister &= shiftmask;
+        // shift most significant bit from byte and move down one bit at a time
+
+        // we do direct lookup of our convolutional output here
+        // all of the bits from this convolution are stored in this row
+        unsigned int out = conv->table[shiftregister];
+        bit_writer_write(conv->bit_writer, out, conv->rate);
+    }
+
+    // now flush the shiftregister
+    // this is simply running the loop as above but without any new inputs
+    // or rather, the new input string is all 0s
+    for (size_t i = 0; i < conv->order + 1; i++) {
+        shiftregister <<= 1;
+        shiftregister &= shiftmask;
+        unsigned int out = conv->table[shiftregister];
+        bit_writer_write(conv->bit_writer, out, conv->rate);
+    }
+
+    // 0-fill any remaining bits on our final byte
+    bit_writer_flush_byte(conv->bit_writer);
+
+    return encoded_len_bits;
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/error_buffer.c b/falcon9_decoder/src/libcorrect/convolutional/error_buffer.c
new file mode 100644
index 00000000..a5fc0ab5
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/error_buffer.c
@@ -0,0 +1,43 @@
+#include "correct/convolutional/error_buffer.h"
+
+error_buffer_t *error_buffer_create(unsigned int num_states) {
+    error_buffer_t *buf = calloc(1, sizeof(error_buffer_t));
+
+    // how large are the error buffers?
+    buf->num_states = num_states;
+
+    // save two error metrics, one for last round and one for this
+    // (double buffer)
+    // the error metric is the aggregated number of bit errors found
+    //   at a given path which terminates at a particular shift register state
+    buf->errors[0] = calloc(sizeof(distance_t), num_states);
+    buf->errors[1] = calloc(sizeof(distance_t), num_states);
+
+    // which buffer are we using, 0 or 1?
+    buf->index = 0;
+
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+
+    return buf;
+}
+
+void error_buffer_destroy(error_buffer_t *buf) {
+    free(buf->errors[0]);
+    free(buf->errors[1]);
+    free(buf);
+}
+
+void error_buffer_reset(error_buffer_t *buf) {
+    memset(buf->errors[0], 0, buf->num_states * sizeof(distance_t));
+    memset(buf->errors[1], 0, buf->num_states * sizeof(distance_t));
+    buf->index = 0;
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+}
+
+void error_buffer_swap(error_buffer_t *buf) {
+    buf->read_errors = buf->errors[buf->index];
+    buf->index = (buf->index + 1) % 2;
+    buf->write_errors = buf->errors[buf->index];
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/history_buffer.c b/falcon9_decoder/src/libcorrect/convolutional/history_buffer.c
new file mode 100644
index 00000000..f54ffdd9
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/history_buffer.c
@@ -0,0 +1,158 @@
+#include "correct/convolutional/history_buffer.h"
+
+history_buffer *history_buffer_create(unsigned int min_traceback_length,
+                                      unsigned int traceback_group_length,
+                                      unsigned int renormalize_interval, unsigned int num_states,
+                                      shift_register_t highbit) {
+    history_buffer *buf = calloc(1, sizeof(history_buffer));
+
+    *(unsigned int *)&buf->min_traceback_length = min_traceback_length;
+    *(unsigned int *)&buf->traceback_group_length = traceback_group_length;
+    *(unsigned int *)&buf->cap = min_traceback_length + traceback_group_length;
+    *(unsigned int *)&buf->num_states = num_states;
+    *(shift_register_t *)&buf->highbit = highbit;
+
+    buf->history = malloc(buf->cap * sizeof(uint8_t *));
+    for (unsigned int i = 0; i < buf->cap; i++) {
+        buf->history[i] = calloc(num_states, sizeof(uint8_t));
+    }
+    buf->fetched = malloc(buf->cap * sizeof(uint8_t));
+
+    buf->index = 0;
+    buf->len = 0;
+
+    buf->renormalize_counter = 0;
+    buf->renormalize_interval = renormalize_interval;
+
+    return buf;
+}
+
+void history_buffer_destroy(history_buffer *buf) {
+    for (unsigned int i = 0; i < buf->cap; i++) {
+        free(buf->history[i]);
+    }
+    free(buf->history);
+    free(buf->fetched);
+    free(buf);
+}
+
+void history_buffer_reset(history_buffer *buf) {
+    buf->len = 0;
+    buf->index = 0;
+}
+
+uint8_t *history_buffer_get_slice(history_buffer *buf) { return buf->history[buf->index]; }
+
+shift_register_t history_buffer_search(history_buffer *buf, const distance_t *distances,
+                                       unsigned int search_every) {
+    shift_register_t bestpath;
+    distance_t leasterror = USHRT_MAX;
+    // search for a state with the least error
+    for (shift_register_t state = 0; state < buf->num_states; state += search_every) {
+        if (distances[state] < leasterror) {
+            leasterror = distances[state];
+            bestpath = state;
+        }
+    }
+    return bestpath;
+}
+
+void history_buffer_renormalize(history_buffer *buf, distance_t *distances,
+                                shift_register_t min_register) {
+    distance_t min_distance = distances[min_register];
+    for (shift_register_t i = 0; i < buf->num_states; i++) {
+        distances[i] -= min_distance;
+    }
+}
+
+void history_buffer_traceback(history_buffer *buf, shift_register_t bestpath,
+                              unsigned int min_traceback_length, bit_writer_t *output) {
+    unsigned int fetched_index = 0;
+    shift_register_t highbit = buf->highbit;
+    unsigned int index = buf->index;
+    unsigned int cap = buf->cap;
+    for (unsigned int j = 0; j < min_traceback_length; j++) {
+        if (index == 0) {
+            index = cap - 1;
+        } else {
+            index--;
+        }
+        // we're walking backwards from what the work we did before
+        // so, we'll shift high order bits in
+        // the path will cross multiple different shift register states, and we determine
+        //   which state by going backwards one time slice at a time
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+    }
+    unsigned int prefetch_index = index;
+    if (prefetch_index == 0) {
+        prefetch_index = cap - 1;
+    } else {
+        prefetch_index--;
+    }
+    unsigned int len = buf->len;
+    for (unsigned int j = min_traceback_length; j < len; j++) {
+        index = prefetch_index;
+        if (prefetch_index == 0) {
+            prefetch_index = cap - 1;
+        } else {
+            prefetch_index--;
+        }
+        prefetch(buf->history[prefetch_index]);
+        // we're walking backwards from what the work we did before
+        // so, we'll shift high order bits in
+        // the path will cross multiple different shift register states, and we determine
+        //   which state by going backwards one time slice at a time
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+        buf->fetched[fetched_index] = (pathbit ? 1 : 0);
+        fetched_index++;
+    }
+    bit_writer_write_bitlist_reversed(output, buf->fetched, fetched_index);
+    buf->len -= fetched_index;
+}
+
+void history_buffer_process_skip(history_buffer *buf, distance_t *distances, bit_writer_t *output,
+                                 unsigned int skip) {
+    buf->index++;
+    if (buf->index == buf->cap) {
+        buf->index = 0;
+    }
+
+    buf->renormalize_counter++;
+    buf->len++;
+
+    // there are four ways these branches can resolve
+    // a) we are neither renormalizing nor doing a traceback
+    // b) we are renormalizing but not doing a traceback
+    // c) we are renormalizing and doing a traceback
+    // d) we are not renormalizing but we are doing a traceback
+    // in case c, we want to save the effort of finding the bestpath
+    //    since that's expensive
+    // so we have to check for that case after we renormalize
+    if (buf->renormalize_counter == buf->renormalize_interval) {
+        buf->renormalize_counter = 0;
+        shift_register_t bestpath = history_buffer_search(buf, distances, skip);
+        history_buffer_renormalize(buf, distances, bestpath);
+        if (buf->len == buf->cap) {
+            // reuse the bestpath found for renormalizing
+            history_buffer_traceback(buf, bestpath, buf->min_traceback_length, output);
+        }
+    } else if (buf->len == buf->cap) {
+        // not renormalizing, find the bestpath here
+        shift_register_t bestpath = history_buffer_search(buf, distances, skip);
+        history_buffer_traceback(buf, bestpath, buf->min_traceback_length, output);
+    }
+}
+
+void history_buffer_process(history_buffer *buf, distance_t *distances, bit_writer_t *output) {
+    history_buffer_process_skip(buf, distances, output, 1);
+}
+
+void history_buffer_flush(history_buffer *buf, bit_writer_t *output) {
+    history_buffer_traceback(buf, 0, 0, output);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/lookup.c b/falcon9_decoder/src/libcorrect/convolutional/lookup.c
new file mode 100644
index 00000000..8c96aae6
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/lookup.c
@@ -0,0 +1,74 @@
+#include "correct/convolutional/lookup.h"
+
+// table has numstates rows
+// each row contains all of the polynomial output bits concatenated together
+// e.g. for rate 2, we have 2 bits in each row
+// the first poly gets the LEAST significant bit, last poly gets most significant
+void fill_table(unsigned int rate,
+                unsigned int order,
+                const polynomial_t *poly,
+                unsigned int *table) {
+    for (shift_register_t i = 0; i < 1 << order; i++) {
+        unsigned int out = 0;
+        unsigned int mask = 1;
+        for (size_t j = 0; j < rate; j++) {
+            out |= (popcount(i & poly[j]) % 2) ? mask : 0;
+            mask <<= 1;
+        }
+        table[i] = out;
+    }
+}
+
+pair_lookup_t pair_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    pair_lookup_t pairs;
+
+    pairs.keys = malloc(sizeof(unsigned int) * (1 << (order - 1)));
+    pairs.outputs = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    unsigned int *inv_outputs = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    unsigned int output_counter = 1;
+    // for every (even-numbered) shift register state, find the concatenated output of the state
+    //   and the subsequent state that follows it (low bit set). then, check to see if this
+    //   concatenated output has a unique key assigned to it already. if not, give it a key.
+    //   if it does, retrieve the key. assign this key to the shift register state.
+    for (unsigned int i = 0; i < (1 << (order - 1)); i++) {
+        // first get the concatenated pair of outputs
+        unsigned int out = table[i * 2 + 1];
+        out <<= rate;
+        out |= table[i * 2];
+
+        // does this concatenated output exist in the outputs table yet?
+        if (!inv_outputs[out]) {
+            // doesn't exist, allocate a new key
+            inv_outputs[out] = output_counter;
+            pairs.outputs[output_counter] = out;
+            output_counter++;
+        }
+        // set the opaque key for the ith shift register state to the concatenated output entry
+        pairs.keys[i] = inv_outputs[out];
+    }
+    pairs.outputs_len = output_counter;
+    pairs.output_mask = (1 << (rate)) - 1;
+    pairs.output_width = rate;
+    pairs.distances = calloc(pairs.outputs_len, sizeof(distance_pair_t));
+    free(inv_outputs);
+    return pairs;
+}
+
+void pair_lookup_destroy(pair_lookup_t pairs) {
+    free(pairs.keys);
+    free(pairs.outputs);
+    free(pairs.distances);
+}
+
+void pair_lookup_fill_distance(pair_lookup_t pairs, distance_t *distances) {
+    for (unsigned int i = 1; i < pairs.outputs_len; i += 1) {
+        output_pair_t concat_out = pairs.outputs[i];
+        unsigned int i_0 = concat_out & pairs.output_mask;
+        concat_out >>= pairs.output_width;
+        unsigned int i_1 = concat_out;
+
+        pairs.distances[i] = (distances[i_1] << 16) | distances[i_0];
+    }
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/metric.c b/falcon9_decoder/src/libcorrect/convolutional/metric.c
new file mode 100644
index 00000000..894db4d9
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/metric.c
@@ -0,0 +1,17 @@
+#include "correct/convolutional/metric.h"
+
+// measure the square of the euclidean distance between x and y
+// since euclidean dist is sqrt(a^2 + b^2 + ... + n^2), the square is just
+//    a^2 + b^2 + ... + n^2
+distance_t metric_soft_distance_quadratic(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
+    distance_t dist = 0;
+    for (unsigned int i = 0; i < len; i++) {
+        // first, convert hard_x to a soft measurement (0 -> 0, 1 - > 255)
+        unsigned int soft_x = (hard_x & 1) ? 255 : 0;
+        hard_x >>= 1;
+        int d = soft_y[i] - soft_x;
+        dist += d*d;
+    }
+    return dist >> 3;
+}
+
diff --git a/falcon9_decoder/src/libcorrect/convolutional/sse/CMakeLists.txt b/falcon9_decoder/src/libcorrect/convolutional/sse/CMakeLists.txt
new file mode 100644
index 00000000..0d0ade9a
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/sse/CMakeLists.txt
@@ -0,0 +1,2 @@
+set(SRCFILES lookup.c convolutional.c encode.c decode.c)
+add_library(correct-convolutional-sse OBJECT ${SRCFILES})
diff --git a/falcon9_decoder/src/libcorrect/convolutional/sse/convolutional.c b/falcon9_decoder/src/libcorrect/convolutional/sse/convolutional.c
new file mode 100644
index 00000000..484c5c6e
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/sse/convolutional.c
@@ -0,0 +1,21 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+correct_convolutional_sse *correct_convolutional_sse_create(size_t rate,
+                                                            size_t order,
+                                                            const polynomial_t *poly) {
+    correct_convolutional_sse *conv = malloc(sizeof(correct_convolutional_sse));
+    correct_convolutional *init_conv = _correct_convolutional_init(&conv->base_conv, rate, order, poly);
+    if (!init_conv) {
+        free(conv);
+        conv = NULL;
+    }
+    return conv;
+}
+
+void correct_convolutional_sse_destroy(correct_convolutional_sse *conv) {
+    if (conv->base_conv.has_init_decode) {
+        oct_lookup_destroy(conv->oct_lookup);
+    }
+    _correct_convolutional_teardown(&conv->base_conv);
+    free(conv);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/sse/decode.c b/falcon9_decoder/src/libcorrect/convolutional/sse/decode.c
new file mode 100644
index 00000000..0f6bcf5b
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/sse/decode.c
@@ -0,0 +1,319 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+static void convolutional_sse_decode_inner(correct_convolutional_sse *sse_conv, unsigned int sets,
+                                           const uint8_t *soft) {
+    correct_convolutional *conv = &sse_conv->base_conv;
+    shift_register_t highbit = 1 << (conv->order - 1);
+    unsigned int hist_buf_index = conv->history_buffer->index;
+    unsigned int hist_buf_cap = conv->history_buffer->cap;
+    unsigned int hist_buf_len = conv->history_buffer->len;
+    unsigned int hist_buf_rn_int = conv->history_buffer->renormalize_interval;
+    unsigned int hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+    for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
+        distance_t *distances = conv->distances;
+        // lasterrors are the aggregate bit errors for the states of
+        // shiftregister for the previous time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        oct_lookup_t oct_lookup = sse_conv->oct_lookup;
+        oct_lookup_fill_distance(oct_lookup, distances);
+
+        // a mask to get the high order bit from the shift register
+        unsigned int num_iter = highbit << 1;
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = conv->history_buffer->history[hist_buf_index];
+        ;
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit
+        // errors at that path at this time slice
+        // this loop considers two paths per iteration (high order bit set,
+        // clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the
+        // least aggregated bit errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share
+        // a successor
+        //
+        // the first set definition is the two states that are the same except
+        // for the least order bit
+        // these two share a predecessor because their high n - 1 bits are the
+        // same (differ only by newest bit)
+        //
+        // the second set definition is the two states that are the same except
+        // for the high order bit
+        // these two share a successor because the oldest high order bit will be
+        // shifted out, and the other bits will be present in the successor
+        //
+        shift_register_t highbase = highbit >> 1;
+        shift_register_t oct_highbase = highbase >> 2;
+        for (shift_register_t low = 0, high = highbit, base = 0, oct = 0; high < num_iter;
+             low += 32, high += 32, base += 16, oct += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            __m128i past_shuffle_mask =
+                _mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100);
+            __m128i hist_mask =
+                _mm_set_epi32(0x80808080, 0x80808080, 0x0e0c0a09, 0x07050301);
+
+            // the loop below calculates 64 register states per loop iteration
+            // it does this by packing the 128-bit xmm registers with 8, 16-bit
+            // distances
+            // 4 of these registers hold distances for convolutional shift
+            // register states with the high bit cleared
+            //      and 4 hold distances for the corresponding shift register
+            //      states with the high bit set
+            // since each xmm register holds 8 distances, this adds up to a
+            // total of 8 * 8 = 64 shift register states
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 16;
+                 offset += 32, base_offset += 16) {
+                // load the past error for the register states with the high
+                // order bit cleared
+                __m128i low_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset));
+                __m128i low_past_error0 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 4));
+                __m128i low_past_error1 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 8));
+                __m128i low_past_error2 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 12));
+
+                // shuffle the low past error
+                // register states that differ only by their low order bit share
+                // a past error
+                low_past_error = _mm_shuffle_epi8(low_past_error, past_shuffle_mask);
+                low_past_error0 = _mm_shuffle_epi8(low_past_error0, past_shuffle_mask);
+                low_past_error1 = _mm_shuffle_epi8(low_past_error1, past_shuffle_mask);
+                low_past_error2 = _mm_shuffle_epi8(low_past_error2, past_shuffle_mask);
+
+                // repeat past error lookup for register states with high order
+                // bit set
+                __m128i high_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + highbase + base + base_offset));
+                __m128i high_past_error0 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 4));
+                __m128i high_past_error1 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 8));
+                __m128i high_past_error2 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 12));
+
+                high_past_error = _mm_shuffle_epi8(high_past_error, past_shuffle_mask);
+                high_past_error0 = _mm_shuffle_epi8(high_past_error0, past_shuffle_mask);
+                high_past_error1 = _mm_shuffle_epi8(high_past_error1, past_shuffle_mask);
+                high_past_error2 = _mm_shuffle_epi8(high_past_error2, past_shuffle_mask);
+
+                // __m128i this_shuffle_mask = (__m128i){0x80800100, 0x80800302,
+                // 0x80800504, 0x80800706};
+
+                // load the opaque oct distance table keys from out loop index
+                distance_oct_key_t low_key = oct_lookup.keys[oct + (base_offset / 4)];
+                distance_oct_key_t low_key0 = oct_lookup.keys[oct + (base_offset / 4) + 1];
+                distance_oct_key_t low_key1 = oct_lookup.keys[oct + (base_offset / 4) + 2];
+                distance_oct_key_t low_key2 = oct_lookup.keys[oct + (base_offset / 4) + 3];
+
+                // load the distances for the register states with high order
+                // bit cleared
+                __m128i low_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key));
+                __m128i low_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key0));
+                __m128i low_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key1));
+                __m128i low_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key2));
+
+                // add the distance for this time slice to the past distances
+                __m128i low_error = _mm_add_epi16(low_past_error, low_this_error);
+                __m128i low_error0 = _mm_add_epi16(low_past_error0, low_this_error0);
+                __m128i low_error1 = _mm_add_epi16(low_past_error1, low_this_error1);
+                __m128i low_error2 = _mm_add_epi16(low_past_error2, low_this_error2);
+
+                // repeat oct distance table lookup for registers with high
+                // order bit set
+                distance_oct_key_t high_key =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4)];
+                distance_oct_key_t high_key0 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 1];
+                distance_oct_key_t high_key1 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 2];
+                distance_oct_key_t high_key2 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 3];
+
+                __m128i high_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key));
+                __m128i high_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key0));
+                __m128i high_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key1));
+                __m128i high_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key2));
+
+                __m128i high_error = _mm_add_epi16(high_past_error, high_this_error);
+                __m128i high_error0 = _mm_add_epi16(high_past_error0, high_this_error0);
+                __m128i high_error1 = _mm_add_epi16(high_past_error1, high_this_error1);
+                __m128i high_error2 = _mm_add_epi16(high_past_error2, high_this_error2);
+
+                // distances for this time slice calculated
+
+                // find the least error between registers who differ only in
+                // their high order bit
+                __m128i min_error = _mm_min_epu16(low_error, high_error);
+                __m128i min_error0 = _mm_min_epu16(low_error0, high_error0);
+                __m128i min_error1 = _mm_min_epu16(low_error1, high_error1);
+                __m128i min_error2 = _mm_min_epu16(low_error2, high_error2);
+
+                _mm_store_si128((__m128i *)(write_errors + low + offset), min_error);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 8), min_error0);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 16), min_error1);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 24), min_error2);
+
+                // generate history bits as (low_error > least_error)
+                // this operation fills each element with all 1s if true and 0s
+                // if false
+                // in other words, we set the history bit to 1 if
+                //      the register state with high order bit set was the least
+                //      error
+                __m128i hist = _mm_cmpgt_epi16(low_error, min_error);
+                // pack the bits down from 16-bit wide to 8-bit wide to
+                // accomodate history table
+                hist = _mm_shuffle_epi8(hist, hist_mask);
+
+                __m128i hist0 = _mm_cmpgt_epi16(low_error0, min_error0);
+                hist0 = _mm_shuffle_epi8(hist0, hist_mask);
+
+                __m128i hist1 = _mm_cmpgt_epi16(low_error1, min_error1);
+                hist1 = _mm_shuffle_epi8(hist1, hist_mask);
+
+                __m128i hist2 = _mm_cmpgt_epi16(low_error2, min_error2);
+                hist2 = _mm_shuffle_epi8(hist2, hist_mask);
+
+                // write the least error so that the next time slice sees it as
+                // the past error
+                // store the history bits set by cmp and shuffle operations
+                _mm_storel_epi64((__m128i *)(history + low + offset), hist);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 8), hist0);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 16), hist1);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 24), hist2);
+            }
+        }
+
+        // bypass the call to history buffer
+        // we should really make that function inline and remove this below
+        if (hist_buf_len == hist_buf_cap - 1 || hist_buf_rn_cnt == hist_buf_rn_int - 1) {
+            // restore hist buffer state and invoke it
+            conv->history_buffer->len = hist_buf_len;
+            conv->history_buffer->index = hist_buf_index;
+            conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+            history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+            // restore our local values
+            hist_buf_len = conv->history_buffer->len;
+            hist_buf_index = conv->history_buffer->index;
+            hist_buf_cap = conv->history_buffer->cap;
+            hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+        } else {
+            hist_buf_len++;
+            hist_buf_index++;
+            if (hist_buf_index == hist_buf_cap) {
+                hist_buf_index = 0;
+            }
+            hist_buf_rn_cnt++;
+        }
+        error_buffer_swap(conv->errors);
+    }
+    conv->history_buffer->len = hist_buf_len;
+    conv->history_buffer->index = hist_buf_index;
+    conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+}
+
+static void _convolutional_sse_decode_init(correct_convolutional_sse *conv,
+                                           unsigned int min_traceback,
+                                           unsigned int traceback_length,
+                                           unsigned int renormalize_interval) {
+    _convolutional_decode_init(&conv->base_conv, min_traceback, traceback_length,
+                               renormalize_interval);
+    conv->oct_lookup =
+        oct_lookup_create(conv->base_conv.rate, conv->base_conv.order, conv->base_conv.table);
+}
+
+static ssize_t _convolutional_sse_decode(correct_convolutional_sse *sse_conv,
+                                         size_t num_encoded_bits, size_t num_encoded_bytes,
+                                         uint8_t *msg, const soft_t *soft_encoded) {
+    correct_convolutional *conv = &sse_conv->base_conv;
+    if (!conv->has_init_decode) {
+        uint64_t max_error_per_input = conv->rate * soft_max;
+        // sse implementation unfortunately uses signed math on our unsigned values
+        // reduces usable distance by /2
+        unsigned int renormalize_interval = (distance_max / 2) / max_error_per_input;
+        _convolutional_sse_decode_init(sse_conv, 5 * conv->order, 100 * conv->order,
+                                       renormalize_interval);
+    }
+
+    size_t sets = num_encoded_bits / conv->rate;
+    // XXX fix this vvvvvv
+    size_t decoded_len_bytes = num_encoded_bytes;
+    bit_writer_reconfigure(conv->bit_writer, msg, decoded_len_bytes);
+
+    error_buffer_reset(conv->errors);
+    history_buffer_reset(conv->history_buffer);
+
+    // no outputs are generated during warmup
+    convolutional_decode_warmup(conv, sets, soft_encoded);
+    convolutional_sse_decode_inner(sse_conv, sets, soft_encoded);
+    convolutional_decode_tail(conv, sets, soft_encoded);
+
+    history_buffer_flush(conv->history_buffer, conv->bit_writer);
+
+    return bit_writer_length(conv->bit_writer);
+}
+
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->base_conv.rate) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    size_t num_encoded_bytes =
+        (num_encoded_bits % 8) ? (num_encoded_bits / 8 + 1) : (num_encoded_bits / 8);
+    bit_reader_reconfigure(conv->base_conv.bit_reader, encoded, num_encoded_bytes);
+
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv, const soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->base_conv.rate) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    size_t num_encoded_bytes =
+        (num_encoded_bits % 8) ? (num_encoded_bits / 8 + 1) : (num_encoded_bits / 8);
+
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/sse/encode.c b/falcon9_decoder/src/libcorrect/convolutional/sse/encode.c
new file mode 100644
index 00000000..92ea10df
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/sse/encode.c
@@ -0,0 +1,9 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len) {
+    return correct_convolutional_encode_len(&conv->base_conv, msg_len);
+}
+
+size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg, size_t msg_len, uint8_t *encoded) {
+    return correct_convolutional_encode(&conv->base_conv, msg, msg_len, encoded);
+}
diff --git a/falcon9_decoder/src/libcorrect/convolutional/sse/lookup.c b/falcon9_decoder/src/libcorrect/convolutional/sse/lookup.c
new file mode 100644
index 00000000..472dd8f9
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/convolutional/sse/lookup.c
@@ -0,0 +1,183 @@
+#include "correct/convolutional/sse/lookup.h"
+
+quad_lookup_t quad_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    quad_lookup_t quads;
+
+    quads.keys = malloc(sizeof(unsigned int) * (1 << (order - 2)));
+    quads.outputs = calloc((1 << (rate * 4)), sizeof(unsigned int));
+    unsigned int *inv_outputs = calloc((1 << (rate * 4)), sizeof(unsigned int));
+    unsigned int output_counter = 1;
+    // for every (even-numbered) shift register state, find the concatenated output of the state
+    //   and the subsequent state that follows it (low bit set). then, check to see if this
+    //   concatenated output has a unique key assigned to it already. if not, give it a key.
+    //   if it does, retrieve the key. assign this key to the shift register state.
+    for (unsigned int i = 0; i < (1 << (order - 2)); i++) {
+        // first get the concatenated quad of outputs
+        unsigned int out = table[i * 4 + 3];
+        out <<= rate;
+        out |= table[i * 4 + 2];
+        out <<= rate;
+        out |= table[i * 4 + 1];
+        out <<= rate;
+        out |= table[i * 4];
+
+        // does this concatenated output exist in the outputs table yet?
+        if (!inv_outputs[out]) {
+            // doesn't exist, allocate a new key
+            inv_outputs[out] = output_counter;
+            quads.outputs[output_counter] = out;
+            output_counter++;
+        }
+        // set the opaque key for the ith shift register state to the concatenated output entry
+        quads.keys[i] = inv_outputs[out];
+    }
+    quads.outputs_len = output_counter;
+    quads.output_mask = (1 << (rate)) - 1;
+    quads.output_width = rate;
+    quads.distances = calloc(quads.outputs_len, sizeof(distance_quad_t));
+    free(inv_outputs);
+    return quads;
+}
+
+void quad_lookup_destroy(quad_lookup_t quads) {
+    free(quads.keys);
+    free(quads.outputs);
+    free(quads.distances);
+}
+
+void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances) {
+    for (unsigned int i = 1; i < quads.outputs_len; i += 1) {
+        output_quad_t concat_out = quads.outputs[i];
+        unsigned int i_0 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_1 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_2 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_3 = concat_out;
+
+        quads.distances[i] = ((uint64_t)distances[i_3] << 48) | ((uint64_t)distances[i_2] << 32) | (distances[i_1] << 16) | distances[i_0];
+    }
+}
+
+distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys) {
+    for (size_t i = 1; i < num_keys; i++) {
+        if (outputs[i] == out) {
+            return i;
+        }
+    }
+    return 0;
+}
+
+oct_lookup_t oct_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    oct_lookup_t octs;
+
+    octs.keys = malloc((1 << (order - 3)) * sizeof(distance_oct_key_t));
+    octs.outputs = malloc(((output_oct_t)2 << rate) * sizeof(uint64_t));
+    output_oct_t *short_outs = calloc(((output_oct_t)2 << rate), sizeof(output_oct_t));
+    size_t outputs_len = 2 << rate;
+    unsigned int output_counter = 1;
+    // for every (even-numbered) shift register state, find the concatenated output of the state
+    //   and the subsequent state that follows it (low bit set). then, check to see if this
+    //   concatenated output has a unique key assigned to it already. if not, give it a key.
+    //   if it does, retrieve the key. assign this key to the shift register state.
+    for (shift_register_t i = 0; i < (1 << (order - 3)); i++) {
+        // first get the concatenated oct of outputs
+        output_oct_t out = table[i * 8 + 7];
+        out <<= rate;
+        out |= table[i * 8 + 6];
+        out <<= rate;
+        out |= table[i * 8 + 5];
+        out <<= rate;
+        out |= table[i * 8 + 4];
+        out <<= rate;
+        out |= table[i * 8 + 3];
+        out <<= rate;
+        out |= table[i * 8 + 2];
+        out <<= rate;
+        out |= table[i * 8 + 1];
+        out <<= rate;
+        out |= table[i * 8];
+
+        distance_oct_key_t key = oct_lookup_find_key(short_outs, out, output_counter);
+        // does this concatenated output exist in the outputs table yet?
+        if (!key) {
+            // doesn't exist, allocate a new key
+            // now build it in expanded form
+            output_oct_t expanded_out = table[i * 8 + 7];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 6];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 5];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 4];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 3];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 2];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8 + 1];
+            expanded_out <<= 8;
+            expanded_out |= table[i * 8];
+
+            if (output_counter == outputs_len) {
+                octs.outputs = realloc(octs.outputs, outputs_len * 2 * sizeof(output_oct_t));
+                short_outs = realloc(short_outs, outputs_len * 2 * sizeof(output_oct_t));
+                outputs_len *= 2;
+            }
+            short_outs[output_counter] = out;
+            octs.outputs[output_counter] = expanded_out;
+            key = output_counter;
+            output_counter++;
+        }
+        // set the opaque key for the ith shift register state to the concatenated output entry
+        // we multiply the key by 2 since the distances are strided by 2
+        octs.keys[i] = key * 2;
+    }
+    free(short_outs);
+    octs.outputs_len = output_counter;
+    octs.output_mask = (1 << (rate)) - 1;
+    octs.output_width = rate;
+    octs.distances = malloc(octs.outputs_len * 2 * sizeof(uint64_t));
+    return octs;
+}
+
+void oct_lookup_destroy(oct_lookup_t octs) {
+    free(octs.keys);
+    free(octs.outputs);
+    free(octs.distances);
+}
+
+// WIP: sse approach to filling the distance table
+/*
+void oct_lookup_fill_distance_sse(oct_lookup_t octs, distance_t *distances) {
+    distance_pair_t *distance_pair = (distance_pair_t*)octs.distances;
+    __v4si index_shuffle_mask = (__v4si){0xffffff00, 0xffffff01, 0xffffff02, 0xffffff03};
+    __m256i dist_shuffle_mask = (__m256i){0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff,
+                                          0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff};
+    const int dist_permute_mask = 0x0c;
+    for (unsigned int i = 1; i < octs.outputs_len; i += 2) {
+        // big heaping todo vvv
+        // a) we want 16 bit distances GATHERed, not 32 bit
+        // b) we need to load 8 of those distances, not 4
+        __v4si short_concat_index = _mm_loadl_epi64(octs.outputs + 2*i);
+        __v4si short_concat_index0 = _mm_loadl_epi64(octs.outputs + 2*i + 1);
+        __m256i concat_index = _mm256_cvtepu8_epi32(short_concat_index);
+        __m256i concat_index0 = _mm256_cvtepu8_epi32(short_concat_index0);
+        __m256i dist = _mm256_i32gather_epi32(distances, concat_index, sizeof(distance_t));
+        __m256i dist0 = _mm256_i32gather_epi32(distances, concat_index0, sizeof(distance_t));
+        dist = _mm256_shuffle_epi8(dist, dist_shuffle_mask);
+        dist0 = _mm256_shuffle_epi8(dist0, dist_shuffle_mask);
+        dist = __builtin_shufflevector(dist, dist, 0, 5, 0, 0);
+        dist0 = __builtin_shufflevector(dist0, dist0, 0, 5, 0, 0);
+        __v4si packed_dist = _mm256_castsi256_si128(dist);
+        _mm_store_si128(distance_pair + 8 * i, packed_dist);
+        __v4si packed_dist0 = _mm256_castsi256_si128(dist0);
+        _mm_store_si128(distance_pair + 8 * i + 4, packed_dist0);
+    }
+}
+*/
diff --git a/falcon9_decoder/src/libcorrect/correct-sse.h b/falcon9_decoder/src/libcorrect/correct-sse.h
new file mode 100644
index 00000000..9372f195
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct-sse.h
@@ -0,0 +1,30 @@
+#ifndef CORRECT_SSE_H
+#define CORRECT_SSE_H
+#include <correct.h>
+
+struct correct_convolutional_sse;
+typedef struct correct_convolutional_sse correct_convolutional_sse;
+
+/* SSE versions of libcorrect's convolutional encoder/decoder.
+ * These instances should not be used with the non-sse functions,
+ * and non-sse instances should not be used with the sse functions.
+ */
+
+correct_convolutional_sse *correct_convolutional_sse_create(
+    size_t rate, size_t order, const correct_convolutional_polynomial_t *poly);
+
+void correct_convolutional_sse_destroy(correct_convolutional_sse *conv);
+
+size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len);
+
+size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg,
+                                        size_t msg_len, uint8_t *encoded);
+
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg);
+
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv,
+                                              const correct_convolutional_soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg);
+
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct.h b/falcon9_decoder/src/libcorrect/correct.h
new file mode 100644
index 00000000..12d817db
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct.h
@@ -0,0 +1,277 @@
+#ifndef CORRECT_H
+#define CORRECT_H
+#include <stdint.h>
+
+#ifndef _MSC_VER
+#include <unistd.h>
+#else
+#include <stddef.h>
+typedef ptrdiff_t ssize_t;
+#endif
+
+
+
+// Convolutional Codes
+
+// Convolutional polynomials are 16 bits wide
+typedef uint16_t correct_convolutional_polynomial_t;
+
+static const correct_convolutional_polynomial_t correct_conv_r12_6_polynomial[] = {073, 061};
+static const correct_convolutional_polynomial_t correct_conv_r12_7_polynomial[] = {0161, 0127};
+static const correct_convolutional_polynomial_t correct_conv_r12_8_polynomial[] = {0225, 0373};
+static const correct_convolutional_polynomial_t correct_conv_r12_9_polynomial[] = {0767, 0545};
+static const correct_convolutional_polynomial_t correct_conv_r13_6_polynomial[] = {053, 075, 047};
+static const correct_convolutional_polynomial_t correct_conv_r13_7_polynomial[] = {0137, 0153,
+                                                                                   0121};
+static const correct_convolutional_polynomial_t correct_conv_r13_8_polynomial[] = {0333, 0257,
+                                                                                   0351};
+static const correct_convolutional_polynomial_t correct_conv_r13_9_polynomial[] = {0417, 0627,
+                                                                                   0675};
+
+typedef uint8_t correct_convolutional_soft_t;
+
+struct correct_convolutional;
+typedef struct correct_convolutional correct_convolutional;
+
+/* correct_convolutional_create allocates and initializes an encoder/decoder for
+ * a convolutional code with the given parameters. This function expects that
+ * poly will contain inv_rate elements. E.g., to create a conv. code instance
+ * with rate 1/2, order 7 and polynomials 0161, 0127, call
+ * correct_convolutional_create(2, 7, []correct_convolutional_polynomial_t{0161, 0127});
+ *
+ * If this call is successful, it returns a non-NULL pointer.
+ */
+correct_convolutional *correct_convolutional_create(size_t inv_rate, size_t order,
+                                                    const correct_convolutional_polynomial_t *poly);
+
+/* correct_convolutional_destroy releases all resources associated
+ * with conv. This pointer should not be used for further calls
+ * after calling destroy.
+ */
+void correct_convolutional_destroy(correct_convolutional *conv);
+
+/* correct_convolutional_encode_len returns the number of *bits*
+ * in a msg_len of given size, in *bytes*. In order to convert
+ * this returned length to bytes, save the result of the length
+ * modulo 8. If it's nonzero, then the length in bytes is
+ * length/8 + 1. If it is zero, then the length is just
+ * length/8.
+ */
+size_t correct_convolutional_encode_len(correct_convolutional *conv, size_t msg_len);
+
+/* correct_convolutional_encode uses the given conv instance to
+ * encode a block of data and write it to encoded. The length of
+ * encoded must be long enough to hold the resulting encoded length,
+ * which can be calculated by calling correct_convolutional_encode_len.
+ * However, this length should first be converted to bytes, as that
+ * function returns the length in bits.
+ *
+ * This function returns the number of bits written to encoded. If
+ * this is not an exact multiple of 8, then it occupies an additional
+ * byte.
+ */
+size_t correct_convolutional_encode(correct_convolutional *conv, const uint8_t *msg, size_t msg_len,
+                                    uint8_t *encoded);
+
+/* correct_convolutional_decode uses the given conv instance to
+ * decode a block encoded by correct_convolutional_encode. This
+ * call can cope with some bits being corrupted. This function
+ * cannot detect if there are too many bits corrupted, however,
+ * and will still write a message even if it is not recovered
+ * correctly. It is up to the user to perform checksums or CRC
+ * in order to guarantee that the decoded message is intact.
+ *
+ * num_encoded_bits should contain the length of encoded in *bits*.
+ * This value need not be an exact multiple of 8. However,
+ * it must be a multiple of the inv_rate used to create
+ * the conv instance.
+ *
+ * This function writes the result to msg, which must be large
+ * enough to hold the decoded message. A good conservative size
+ * for this buffer is the number of encoded bits multiplied by the
+ * rate of the code, e.g. for a rate 1/2 code, divide by 2. This
+ * value should then be converted to bytes to find the correct
+ * length for msg.
+ *
+ * This function returns the number of bytes written to msg. If
+ * it fails, it returns -1.
+ */
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg);
+
+/* correct_convolutional_decode_soft uses the given conv instance
+ * to decode a block encoded by correct_convolutional_encode and
+ * then modulated/demodulated to 8-bit symbols. This function expects
+ * that 1 is mapped to 255 and 0 to 0. An erased symbol should be
+ * set to 128. The decoded message may contain errors.
+ *
+ * num_encoded_bits should contain the length of encoded in *bits*.
+ * This value need not be an exact multiple of 8. However,
+ * it must be a multiple of the inv_rate used to create
+ * the conv instance.
+ *
+ * This function writes the result to msg, which must be large
+ * enough to hold the decoded message. A good conservative size
+ * for this buffer is the number of encoded bits multiplied by the
+ * rate of the code, e.g. for a rate 1/2 code, divide by 2. This
+ * value should then be converted to bytes to find the correct
+ * length for msg.
+ *
+ * This function returns the number of bytes written to msg. If
+ * it fails, it returns -1.
+ */
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv,
+                                          const correct_convolutional_soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg);
+
+// Reed-Solomon
+
+struct correct_reed_solomon;
+typedef struct correct_reed_solomon correct_reed_solomon;
+
+static const uint16_t correct_rs_primitive_polynomial_8_4_3_2_0 =
+    0x11d;  // x^8 + x^4 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_5_3_1_0 =
+    0x12b;  // x^8 + x^5 + x^3 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_5_3_2_0 =
+    0x12d;  // x^8 + x^5 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_3_2_0 =
+    0x14d;  // x^8 + x^6 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_4_3_2_1_0 =
+    0x15f;  // x^8 + x^6 + x^4 + x^3 + x^2 + x + 1;
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_1_0 =
+    0x163;  // x^8 + x^6 + x^5 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_2_0 =
+    0x165;  // x^8 + x^6 + x^5 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_3_0 =
+    0x169;  // x^8 + x^6 + x^5 + x^3 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_4_0 =
+    0x171;  // x^8 + x^6 + x^5 + x^4 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_2_1_0 =
+    0x187;  // x^8 + x^7 + x^2 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_3_2_0 =
+    0x18d;  // x^8 + x^7 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_5_3_0 =
+    0x1a9;  // x^8 + x^7 + x^5 + x^3 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_1_0 =
+    0x1c3;  // x^8 + x^7 + x^6 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_3_2_1_0 =
+    0x1cf;  // x^8 + x^7 + x^6 + x^3 + x^2 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_5_2_1_0 =
+    0x1e7;  // x^8 + x^7 + x^6 + x^5 + x^2 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_5_4_2_0 =
+    0x1f5;  // x^8 + x^7 + x^6 + x^5 + x^4 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_ccsds =
+    0x187;  // x^8 + x^7 + x^2 + x + 1
+
+/* correct_reed_solomon_create allocates and initializes an
+ * encoder/decoder for a given reed solomon error correction
+ * code. The block size must be 255 bytes with 8-bit symbols.
+ *
+ * This block can repair corrupted bytes. It can handle as
+ * many as num_roots/2 bytes having corruption and still recover
+ * the encoded payload. However, using more num_roots
+ * adds more parity overhead and substantially increases
+ * the computational time for decoding.
+ *
+ * primitive_polynomial should be one of the given values in this
+ * file. Sane values for first_consecutive_root and
+ * generator_root_gap are 1 and 1. Not all combinations of
+ * values produce valid codes.
+ */
+correct_reed_solomon *correct_reed_solomon_create(uint16_t primitive_polynomial,
+                                                  uint8_t first_consecutive_root,
+                                                  uint8_t generator_root_gap,
+                                                  size_t num_roots);
+
+/* correct_reed_solomon_encode uses the rs instance to encode
+ * parity information onto a block of data. msg_length should be
+ * no more than the payload size for one block e.g. no more
+ * than 223 for a (255, 223) code. Shorter blocks will be encoded
+ * with virtual padding where the padding is not emitted.
+ *
+ * encoded should be at least msg_length + parity length bytes long
+ *
+ * It is allowable for msg and encoded to be the same pointer. In
+ * that case, the parity bytes will be written after the msg bytes
+ * end.
+ *
+ * This function returns the number of bytes written to encoded.
+ */
+ssize_t correct_reed_solomon_encode(correct_reed_solomon *rs, const uint8_t *msg, size_t msg_length,
+                                    uint8_t *encoded);
+
+/* correct_reed_solomon_decode uses the rs instance to decode
+ * a payload from a block containing payload and parity bytes.
+ * This function can recover in spite of some bytes being corrupted.
+ *
+ * In most cases, if the block is too corrupted, this function
+ * will return -1 and not perform decoding. It is possible but
+ * unlikely that the payload written to msg will contain
+ * errors when this function returns a positive value.
+ *
+ * msg should be long enough to contain a decoded payload for
+ * this encoded block.
+ *
+ * This function returns a positive number of bytes written to msg
+ * if it has decoded or -1 if it has encountered an error.
+ */
+ssize_t correct_reed_solomon_decode(correct_reed_solomon *rs, const uint8_t *encoded,
+                                    size_t encoded_length, uint8_t *msg);
+
+/* correct_reed_solomon_decode_with_erasures uses the rs
+ * instance to decode a payload from a block containing payload
+ * and parity bytes. Additionally, the user can provide the
+ * indices of bytes which have been suspected to be corrupted.
+ * This erasure information is typically provided by a demodulating
+ * or receiving device. This function can recover with
+ * some additional errors on top of the erasures.
+ *
+ * In order to successfully decode, the quantity
+ * (num_erasures + 2*num_errors) must be less than
+ * num_roots.
+ *
+ * erasure_locations shold contain erasure_length items.
+ * erasure_length should not exceed the number of parity
+ * bytes encoded into this block.
+ *
+ * In most cases, if the block is too corrupted, this function
+ * will return -1 and not perform decoding. It is possible but
+ * unlikely that the payload written to msg will contain
+ * errors when this function returns a positive value.
+ *
+ * msg should be long enough to contain a decoded payload for
+ * this encoded block.
+ *
+ * This function returns a positive number of bytes written to msg
+ * if it has decoded or -1 if it has encountered an error.
+ */
+ssize_t correct_reed_solomon_decode_with_erasures(correct_reed_solomon *rs, const uint8_t *encoded,
+                                                  size_t encoded_length,
+                                                  const uint8_t *erasure_locations,
+                                                  size_t erasure_length, uint8_t *msg);
+
+/* correct_reed_solomon_destroy releases the resources
+ * associated with rs. This pointer should not be
+ * used for any functions after this call.
+ */
+void correct_reed_solomon_destroy(correct_reed_solomon *rs);
+
+#endif
+
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional.h b/falcon9_decoder/src/libcorrect/correct/convolutional.h
new file mode 100644
index 00000000..06b17108
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional.h
@@ -0,0 +1,28 @@
+#ifndef CORRECT_CONVOLUTIONAL
+#define CORRECT_CONVOLUTIONAL
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+typedef unsigned int shift_register_t;
+typedef uint16_t polynomial_t;
+typedef uint64_t path_t;
+typedef uint8_t soft_t;
+static const soft_t soft_max = UINT8_MAX;
+
+typedef uint16_t distance_t;
+static const distance_t distance_max = UINT16_MAX;
+
+typedef enum {
+    CORRECT_SOFT_LINEAR,
+    CORRECT_SOFT_QUADRATIC,
+} soft_measurement_t;
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/bit.h b/falcon9_decoder/src/libcorrect/correct/convolutional/bit.h
new file mode 100644
index 00000000..64838986
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/bit.h
@@ -0,0 +1,44 @@
+#ifndef CORRECT_CONVOLUTIONAL_BIT
+#define CORRECT_CONVOLUTIONAL_BIT
+#include "correct/convolutional.h"
+
+typedef struct {
+    uint8_t current_byte;
+    unsigned int current_byte_len;
+    uint8_t *bytes;
+    size_t byte_index;
+    size_t len;
+} bit_writer_t;
+
+bit_writer_t *bit_writer_create(uint8_t *bytes, size_t len);
+
+void bit_writer_reconfigure(bit_writer_t *w, uint8_t *bytes, size_t len);
+
+void bit_writer_destroy(bit_writer_t *w);
+
+void bit_writer_write(bit_writer_t *w, uint8_t val, unsigned int n);
+
+void bit_writer_write_1(bit_writer_t *w, uint8_t val);
+
+void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len);
+
+void bit_writer_flush_byte(bit_writer_t *w);
+
+size_t bit_writer_length(bit_writer_t *w);
+
+typedef struct {
+    uint8_t current_byte;
+    size_t byte_index;
+    size_t len;
+    size_t current_byte_len;
+    const uint8_t *bytes;
+} bit_reader_t;
+
+bit_reader_t *bit_reader_create(const uint8_t *bytes, size_t len);
+
+void bit_reader_reconfigure(bit_reader_t *r, const uint8_t *bytes, size_t len);
+
+void bit_reader_destroy(bit_reader_t *r);
+
+uint8_t bit_reader_read(bit_reader_t *r, unsigned int n);
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/convolutional.h b/falcon9_decoder/src/libcorrect/correct/convolutional/convolutional.h
new file mode 100644
index 00000000..b0f66f09
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/convolutional.h
@@ -0,0 +1,40 @@
+#ifndef CORRECT_CONVOLUTIONAL_H
+#define CORRECT_CONVOLUTIONAL_H
+#include "correct/convolutional.h"
+#include "correct/convolutional/bit.h"
+#include "correct/convolutional/metric.h"
+#include "correct/convolutional/lookup.h"
+#include "correct/convolutional/history_buffer.h"
+#include "correct/convolutional/error_buffer.h"
+
+struct correct_convolutional {
+    const unsigned int *table;  // size 2**order
+    size_t rate;                // e.g. 2, 3...
+    size_t order;               // e.g. 7, 9...
+    unsigned int numstates;     // 2**order
+    bit_writer_t *bit_writer;
+    bit_reader_t *bit_reader;
+
+    bool has_init_decode;
+    distance_t *distances;
+    pair_lookup_t pair_lookup;
+    soft_measurement_t soft_measurement;
+    history_buffer *history_buffer;
+    error_buffer_t *errors;
+};
+
+correct_convolutional *_correct_convolutional_init(correct_convolutional *conv,
+                                                   size_t rate, size_t order,
+                                                   const polynomial_t *poly);
+void _correct_convolutional_teardown(correct_convolutional *conv);
+
+// portable versions
+void _convolutional_decode_init(correct_convolutional *conv, unsigned int min_traceback, unsigned int traceback_length, unsigned int renormalize_interval);
+void convolutional_decode_warmup(correct_convolutional *conv, unsigned int sets,
+                                 const uint8_t *soft);
+void convolutional_decode_inner(correct_convolutional *conv, unsigned int sets,
+                                const uint8_t *soft);
+void convolutional_decode_tail(correct_convolutional *conv, unsigned int sets,
+                               const uint8_t *soft);
+#endif
+
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/error_buffer.h b/falcon9_decoder/src/libcorrect/correct/convolutional/error_buffer.h
new file mode 100644
index 00000000..12e3f756
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/error_buffer.h
@@ -0,0 +1,15 @@
+#include "correct/convolutional.h"
+
+typedef struct {
+    unsigned int index;
+    distance_t *errors[2];
+    unsigned int num_states;
+
+    const distance_t *read_errors;
+    distance_t *write_errors;
+} error_buffer_t;
+
+error_buffer_t *error_buffer_create(unsigned int num_states);
+void error_buffer_destroy(error_buffer_t *buf);
+void error_buffer_reset(error_buffer_t *buf);
+void error_buffer_swap(error_buffer_t *buf);
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/history_buffer.h b/falcon9_decoder/src/libcorrect/correct/convolutional/history_buffer.h
new file mode 100644
index 00000000..ca01e21b
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/history_buffer.h
@@ -0,0 +1,59 @@
+#include "correct/convolutional.h"
+#include "correct/convolutional/bit.h"
+
+// ring buffer of path histories
+// generates output bits after accumulating sufficient history
+typedef struct {
+    // history entries must be at least this old to be decoded
+    const unsigned int min_traceback_length;
+    // we'll decode entries in bursts. this tells us the length of the burst
+    const unsigned int traceback_group_length;
+    // we will store a total of cap entries. equal to min_traceback_length +
+    // traceback_group_length
+    const unsigned int cap;
+
+    // how many states in the shift register? this is one of the dimensions of
+    // history table
+    const unsigned int num_states;
+    // what's the high order bit of the shift register?
+    const shift_register_t highbit;
+
+    // history is a compact history representation for every shift register
+    // state,
+    //    one bit per time slice
+    uint8_t **history;
+
+    // which slice are we writing next?
+    unsigned int index;
+
+    // how many valid entries are there?
+    unsigned int len;
+
+    // temporary store of fetched bits
+    uint8_t *fetched;
+
+    // how often should we renormalize?
+    unsigned int renormalize_interval;
+    unsigned int renormalize_counter;
+} history_buffer;
+
+history_buffer *history_buffer_create(unsigned int min_traceback_length,
+                                      unsigned int traceback_group_length,
+                                      unsigned int renormalize_interval,
+                                      unsigned int num_states,
+                                      shift_register_t highbit);
+void history_buffer_destroy(history_buffer *buf);
+void history_buffer_reset(history_buffer *buf);
+void history_buffer_step(history_buffer *buf);
+uint8_t *history_buffer_get_slice(history_buffer *buf);
+shift_register_t history_buffer_search(history_buffer *buf,
+                                       const distance_t *distances,
+                                       unsigned int search_every);
+void history_buffer_traceback(history_buffer *buf, shift_register_t bestpath,
+                              unsigned int min_traceback_length,
+                              bit_writer_t *output);
+void history_buffer_process_skip(history_buffer *buf, distance_t *distances,
+                                 bit_writer_t *output, unsigned int skip);
+void history_buffer_process(history_buffer *buf, distance_t *distances,
+                            bit_writer_t *output);
+void history_buffer_flush(history_buffer *buf, bit_writer_t *output);
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/lookup.h b/falcon9_decoder/src/libcorrect/correct/convolutional/lookup.h
new file mode 100644
index 00000000..c2573897
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/lookup.h
@@ -0,0 +1,27 @@
+#ifndef CORRECT_CONVOLUTIONAL_LOOKUP
+#define CORRECT_CONVOLUTIONAL_LOOKUP
+#include "correct/convolutional.h"
+
+typedef unsigned int distance_pair_key_t;
+typedef uint32_t output_pair_t;
+typedef uint32_t distance_pair_t;
+
+typedef struct {
+    distance_pair_key_t *keys;
+    output_pair_t *outputs;
+    output_pair_t output_mask;
+    unsigned int output_width;
+    size_t outputs_len;
+    distance_pair_t *distances;
+} pair_lookup_t;
+
+void fill_table(unsigned int order,
+                unsigned int rate,
+                const polynomial_t *poly,
+                unsigned int *table);
+pair_lookup_t pair_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table);
+void pair_lookup_destroy(pair_lookup_t pairs);
+void pair_lookup_fill_distance(pair_lookup_t pairs, distance_t *distances);
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/metric.h b/falcon9_decoder/src/libcorrect/correct/convolutional/metric.h
new file mode 100644
index 00000000..5e1cd55a
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/metric.h
@@ -0,0 +1,20 @@
+#include "correct/convolutional.h"
+
+// measure the hamming distance of two bit strings
+// implemented as population count of x XOR y
+static inline distance_t metric_distance(unsigned int x, unsigned int y) {
+    return popcount(x ^ y);
+}
+
+static inline distance_t metric_soft_distance_linear(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
+    distance_t dist = 0;
+    for (unsigned int i = 0; i < len; i++) {
+        unsigned int soft_x = ((int8_t)(0) - (hard_x & 1)) & 0xff;
+        hard_x >>= 1;
+        int d = soft_y[i] - soft_x;
+        dist += (d < 0) ? -d : d;
+    }
+    return dist;
+}
+
+distance_t metric_soft_distance_quadratic(unsigned int hard_x, const uint8_t *soft_y, size_t len);
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/sse/convolutional.h b/falcon9_decoder/src/libcorrect/correct/convolutional/sse/convolutional.h
new file mode 100644
index 00000000..bf4ad880
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/sse/convolutional.h
@@ -0,0 +1,15 @@
+#include "correct/convolutional/convolutional.h"
+#include "correct/convolutional/sse/lookup.h"
+// BIG HEAPING TODO sort out the include mess
+#include "correct-sse.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+
+struct correct_convolutional_sse {
+    correct_convolutional base_conv;
+    oct_lookup_t oct_lookup;
+};
diff --git a/falcon9_decoder/src/libcorrect/correct/convolutional/sse/lookup.h b/falcon9_decoder/src/libcorrect/correct/convolutional/sse/lookup.h
new file mode 100644
index 00000000..53e6e8a5
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/convolutional/sse/lookup.h
@@ -0,0 +1,65 @@
+#include "correct/convolutional/lookup.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef unsigned int distance_quad_key_t;
+typedef unsigned int output_quad_t;
+typedef uint64_t distance_quad_t;
+
+typedef struct {
+    distance_quad_key_t *keys;
+    output_quad_t *outputs;
+    output_quad_t output_mask;
+    unsigned int output_width;
+    size_t outputs_len;
+    distance_quad_t *distances;
+} quad_lookup_t;
+
+typedef uint16_t distance_oct_key_t;
+typedef uint64_t output_oct_t;
+typedef uint64_t distance_oct_t;
+
+typedef struct {
+    distance_oct_key_t *keys;
+    output_oct_t *outputs;
+    output_oct_t output_mask;
+    unsigned int output_width;
+    size_t outputs_len;
+    distance_oct_t *distances;
+} oct_lookup_t;
+
+quad_lookup_t quad_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table);
+void quad_lookup_destroy(quad_lookup_t quads);
+void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances);
+distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys);
+oct_lookup_t oct_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table);
+void oct_lookup_destroy(oct_lookup_t octs);
+static inline void oct_lookup_fill_distance(oct_lookup_t octs, distance_t *distances) {
+    distance_pair_t *pairs = (distance_pair_t*)octs.distances;
+    for (unsigned int i = 1; i < octs.outputs_len; i += 1) {
+        output_oct_t concat_out = octs.outputs[i];
+        unsigned int i_0 = concat_out & 0xff;
+        unsigned int i_1 = (concat_out >> 8) & 0xff;
+        unsigned int i_2 = (concat_out >> 16) & 0xff;
+        unsigned int i_3 = (concat_out >> 24) & 0xff;
+
+        pairs[i*4 + 1] = distances[i_3] << 16 | distances[i_2];
+        pairs[i*4 + 0] = distances[i_1] << 16 | distances[i_0];
+
+        concat_out >>= 32;
+        unsigned int i_4 = concat_out & 0xff;
+        unsigned int i_5 = (concat_out >> 8) & 0xff;
+        unsigned int i_6 = (concat_out >> 16) & 0xff;
+        unsigned int i_7 = (concat_out >> 24) & 0xff;
+
+        pairs[i*4 + 3] = distances[i_7] << 16 | distances[i_6];
+        pairs[i*4 + 2] = distances[i_5] << 16 | distances[i_4];
+    }
+}
diff --git a/falcon9_decoder/src/libcorrect/correct/portable.h b/falcon9_decoder/src/libcorrect/correct/portable.h
new file mode 100644
index 00000000..0c62e9fb
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/portable.h
@@ -0,0 +1,20 @@
+#ifdef __GNUC__
+#define HAVE_BUILTINS
+#endif
+
+
+#ifdef HAVE_BUILTINS
+#define popcount __builtin_popcount
+#define prefetch __builtin_prefetch
+#else
+
+static inline int popcount(int x) {
+    /* taken from the helpful http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel */
+    x = x - ((x >> 1) & 0x55555555);
+    x = (x & 0x33333333) + ((x >> 2) & 0x33333333);
+    return ((x + (x >> 4) & 0x0f0f0f0f) * 0x01010101) >> 24;
+}
+
+static inline void prefetch(void *x) {}
+
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct/reed-solomon.h b/falcon9_decoder/src/libcorrect/correct/reed-solomon.h
new file mode 100644
index 00000000..94d3efbb
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/reed-solomon.h
@@ -0,0 +1,76 @@
+#ifndef CORRECT_REED_SOLOMON
+#define CORRECT_REED_SOLOMON
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+#include <stdint.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+// an element in GF(2^8)
+typedef uint8_t field_element_t;
+
+// a power of the primitive element alpha
+typedef uint8_t field_logarithm_t;
+
+// give us some bits of headroom to do arithmetic
+// variables of this type aren't really in any proper space
+typedef uint16_t field_operation_t;
+
+// generated by find_poly
+typedef struct {
+    const field_element_t *exp;
+    const field_logarithm_t *log;
+} field_t;
+
+typedef struct {
+    field_element_t *coeff;
+    unsigned int order;
+} polynomial_t;
+
+struct correct_reed_solomon {
+    size_t block_length;
+    size_t message_length;
+    size_t min_distance;
+
+    field_logarithm_t first_consecutive_root;
+    field_logarithm_t generator_root_gap;
+
+    field_t field;
+
+    polynomial_t generator;
+    field_element_t *generator_roots;
+    field_logarithm_t **generator_root_exp;
+
+    polynomial_t encoded_polynomial;
+    polynomial_t encoded_remainder;
+
+    field_element_t *syndromes;
+    field_element_t *modified_syndromes;
+    polynomial_t received_polynomial;
+    polynomial_t error_locator;
+    polynomial_t error_locator_log;
+    polynomial_t erasure_locator;
+    field_element_t *error_roots;
+    field_element_t *error_vals;
+    field_logarithm_t *error_locations;
+
+    field_logarithm_t **element_exp;
+
+    // scratch
+    // (do no allocations at steady state)
+
+    // used during find_error_locator
+    polynomial_t last_error_locator;
+
+    // used during error value search
+    polynomial_t error_evaluator;
+    polynomial_t error_locator_derivative;
+    polynomial_t init_from_roots_scratch[2];
+    bool has_init_decode;
+
+};
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct/reed-solomon/decode.h b/falcon9_decoder/src/libcorrect/correct/reed-solomon/decode.h
new file mode 100644
index 00000000..db1e011c
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/reed-solomon/decode.h
@@ -0,0 +1,3 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+#include "correct/reed-solomon/polynomial.h"
diff --git a/falcon9_decoder/src/libcorrect/correct/reed-solomon/encode.h b/falcon9_decoder/src/libcorrect/correct/reed-solomon/encode.h
new file mode 100644
index 00000000..db1e011c
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/reed-solomon/encode.h
@@ -0,0 +1,3 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+#include "correct/reed-solomon/polynomial.h"
diff --git a/falcon9_decoder/src/libcorrect/correct/reed-solomon/field.h b/falcon9_decoder/src/libcorrect/correct/reed-solomon/field.h
new file mode 100644
index 00000000..ef656a59
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/reed-solomon/field.h
@@ -0,0 +1,167 @@
+#ifndef CORRECT_REED_SOLOMON_FIELD
+#define CORRECT_REED_SOLOMON_FIELD
+#include "correct/reed-solomon.h"
+
+/*
+field_t field_create(field_operation_t primitive_poly);
+void field_destroy(field_t field);
+field_element_t field_add(field_t field, field_element_t l, field_element_t r);
+field_element_t field_sub(field_t field, field_element_t l, field_element_t r);
+field_element_t field_sum(field_t field, field_element_t elem, unsigned int n);
+field_element_t field_mul(field_t field, field_element_t l, field_element_t r);
+field_element_t field_div(field_t field, field_element_t l, field_element_t r);
+field_logarithm_t field_mul_log(field_t field, field_logarithm_t l, field_logarithm_t r);
+field_logarithm_t field_div_log(field_t field, field_logarithm_t l, field_logarithm_t r);
+field_element_t field_mul_log_element(field_t field, field_logarithm_t l, field_logarithm_t r);
+field_element_t field_pow(field_t field, field_element_t elem, int pow);
+*/
+
+static inline field_element_t field_mul_log_element(field_t field, field_logarithm_t l, field_logarithm_t r) {
+    // like field_mul_log, but returns a field_element_t
+    // because we are doing lookup here, we can safely skip the wrapover check
+    field_operation_t res = (field_operation_t)l + (field_operation_t)r;
+    return field.exp[res];
+}
+
+static inline field_t field_create(field_operation_t primitive_poly) {
+    // in GF(2^8)
+    // log and exp
+    // bits are in GF(2), compute alpha^val in GF(2^8)
+    // exp should be of size 512 so that it can hold a "wraparound" which prevents some modulo ops
+    // log should be of size 256. no wraparound here, the indices into this table are field elements
+    field_element_t *exp = malloc(512 * sizeof(field_element_t));
+    field_logarithm_t *log = malloc(256 * sizeof(field_logarithm_t));
+
+    // assume alpha is a primitive element, p(x) (primitive_poly) irreducible in GF(2^8)
+    // addition is xor
+    // subtraction is addition (also xor)
+    // e.g. x^5 + x^4 + x^4 + x^2 + 1 = x^5 + x^2 + 1
+    // each row of exp contains the field element found by exponentiating
+    //   alpha by the row index
+    // each row of log contains the coefficients of
+    //   alpha^7 + alpha^6 + alpha^5 + alpha^4 + alpha^3 + alpha^2 + alpha + 1
+    // as 8 bits packed into one byte
+
+    field_operation_t element = 1;
+    exp[0] = (field_element_t)element;
+    log[0] = (field_logarithm_t)0;  // really, it's undefined. we shouldn't ever access this
+    for (field_operation_t i = 1; i < 512; i++) {
+        element = element * 2;
+        element = (element > 255) ? (element ^ primitive_poly) : element;
+        exp[i] = (field_element_t)element;
+        if (i < 256) {
+            log[element] = (field_logarithm_t)i;
+        }
+    }
+
+    field_t field;
+    *(field_element_t **)&field.exp = exp;
+    *(field_logarithm_t **)&field.log = log;
+
+    return field;
+}
+
+static inline void field_destroy(field_t field) {
+    free(*(field_element_t **)&field.exp);
+    free(*(field_element_t **)&field.log);
+}
+
+static inline field_element_t field_add(field_t field, field_element_t l, field_element_t r) {
+    return l ^ r;
+}
+
+static inline field_element_t field_sub(field_t field, field_element_t l, field_element_t r) {
+    return l ^ r;
+}
+
+static inline field_element_t field_sum(field_t field, field_element_t elem, unsigned int n) {
+    // we'll do a closed-form expression of the sum, although we could also
+    //   choose to call field_add n times
+
+    // since the sum is actually the bytewise XOR operator, this suggests two
+    // kinds of values: n odd, and n even
+
+    // if you sum once, you have coeff
+    // if you sum twice, you have coeff XOR coeff = 0
+    // if you sum thrice, you are back at coeff
+    // an even number of XORs puts you at 0
+    // an odd number of XORs puts you back at your value
+
+    // so, just throw away all the even n
+    return (n % 2) ? elem : 0;
+}
+
+static inline field_element_t field_mul(field_t field, field_element_t l, field_element_t r) {
+    if (l == 0 || r == 0) {
+        return 0;
+    }
+    // multiply two field elements by adding their logarithms.
+    // yep, get your slide rules out
+    field_operation_t res = (field_operation_t)field.log[l] + (field_operation_t)field.log[r];
+
+    // if coeff exceeds 255, we would normally have to wrap it back around
+    // alpha^255 = 1; alpha^256 = alpha^255 * alpha^1 = alpha^1
+    // however, we've constructed exponentiation table so that
+    //   we can just directly lookup this result
+    // the result must be clamped to [0, 511]
+    // the greatest we can see at this step is alpha^255 * alpha^255
+    //   = alpha^510
+    return field.exp[res];
+}
+
+static inline field_element_t field_div(field_t field, field_element_t l, field_element_t r) {
+    if (l == 0) {
+        return 0;
+    }
+
+    if (r == 0) {
+        // XXX ???
+        return 0;
+    }
+
+    // division as subtraction of logarithms
+
+    // if rcoeff is larger, then log[l] - log[r] wraps under
+    // so, instead, always add 255. in some cases, we'll wrap over, but
+    // that's ok because the exp table runs up to 511.
+    field_operation_t res = (field_operation_t)255 + (field_operation_t)field.log[l] - (field_operation_t)field.log[r];
+    return field.exp[res];
+}
+
+static inline field_logarithm_t field_mul_log(field_t field, field_logarithm_t l, field_logarithm_t r) {
+    // this function performs the equivalent of field_mul on two logarithms
+    // we save a little time by skipping the lookup step at the beginning
+    field_operation_t res = (field_operation_t)l + (field_operation_t)r;
+
+    // because we arent using the table, the value we return must be a valid logarithm
+    // which we have decided must live in [0, 255] (they are 8-bit values)
+    // ensuring this makes it so that multiple muls will not reach past the end of the
+    // exp table whenever we finally convert back to an element
+    if (res > 255) {
+        return (field_logarithm_t)(res - 255);
+    }
+    return (field_logarithm_t)res;
+}
+
+static inline field_logarithm_t field_div_log(field_t field, field_logarithm_t l, field_logarithm_t r) {
+    // like field_mul_log, this performs field_div without going through a field_element_t
+    field_operation_t res = (field_operation_t)255 + (field_operation_t)l - (field_operation_t)r;
+    if (res > 255) {
+        return (field_logarithm_t)(res - 255);
+    }
+    return (field_logarithm_t)res;
+}
+
+static inline field_element_t field_pow(field_t field, field_element_t elem, int pow) {
+    // take the logarithm, multiply, and then "exponentiate"
+    // n.b. the exp table only considers powers of alpha, the primitive element
+    // but here we have an arbitrary coeff
+    field_logarithm_t log = field.log[elem];
+    int res_log = log * pow;
+    int mod = res_log % 255;
+    if (mod < 0) {
+        mod += 255;
+    }
+    return field.exp[mod];
+}
+#endif
diff --git a/falcon9_decoder/src/libcorrect/correct/reed-solomon/polynomial.h b/falcon9_decoder/src/libcorrect/correct/reed-solomon/polynomial.h
new file mode 100644
index 00000000..e084e11e
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/reed-solomon/polynomial.h
@@ -0,0 +1,14 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+
+polynomial_t polynomial_create(unsigned int order);
+void polynomial_destroy(polynomial_t polynomial);
+void polynomial_mul(field_t field, polynomial_t l, polynomial_t r, polynomial_t res);
+void polynomial_mod(field_t field, polynomial_t dividend, polynomial_t divisor, polynomial_t mod);
+void polynomial_formal_derivative(field_t field, polynomial_t poly, polynomial_t der);
+field_element_t polynomial_eval(field_t field, polynomial_t poly, field_element_t val);
+field_element_t polynomial_eval_lut(field_t field, polynomial_t poly, const field_logarithm_t *val_exp);
+field_element_t polynomial_eval_log_lut(field_t field, polynomial_t poly_log, const field_logarithm_t *val_exp);
+void polynomial_build_exp_lut(field_t field, field_element_t val, unsigned int order, field_logarithm_t *val_exp);
+polynomial_t polynomial_init_from_roots(field_t field, unsigned int nroots, field_element_t *roots, polynomial_t poly, polynomial_t *scratch);
+polynomial_t polynomial_create_from_roots(field_t field, unsigned int nroots, field_element_t *roots);
diff --git a/falcon9_decoder/src/libcorrect/correct/reed-solomon/reed-solomon.h b/falcon9_decoder/src/libcorrect/correct/reed-solomon/reed-solomon.h
new file mode 100644
index 00000000..db1e011c
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/reed-solomon/reed-solomon.h
@@ -0,0 +1,3 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+#include "correct/reed-solomon/polynomial.h"
diff --git a/falcon9_decoder/src/libcorrect/correct/util/error-sim-fec.h b/falcon9_decoder/src/libcorrect/correct/util/error-sim-fec.h
new file mode 100644
index 00000000..b3deeb1d
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/util/error-sim-fec.h
@@ -0,0 +1,8 @@
+#include "correct/util/error-sim.h"
+
+#include <fec.h>
+
+void conv_fec27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/falcon9_decoder/src/libcorrect/correct/util/error-sim-shim.h b/falcon9_decoder/src/libcorrect/correct/util/error-sim-shim.h
new file mode 100644
index 00000000..aed1fc14
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/util/error-sim-shim.h
@@ -0,0 +1,7 @@
+#include "correct/util/error-sim.h"
+#include "fec_shim.h"
+
+ssize_t conv_shim27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/falcon9_decoder/src/libcorrect/correct/util/error-sim-sse.h b/falcon9_decoder/src/libcorrect/correct/util/error-sim-sse.h
new file mode 100644
index 00000000..2447854b
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/util/error-sim-sse.h
@@ -0,0 +1,7 @@
+#include "correct/util/error-sim.h"
+
+#include "correct-sse.h"
+
+size_t conv_correct_sse_enclen(void *conv_v, size_t msg_len);
+void conv_correct_sse_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+ssize_t conv_correct_sse_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/falcon9_decoder/src/libcorrect/correct/util/error-sim.h b/falcon9_decoder/src/libcorrect/correct/util/error-sim.h
new file mode 100644
index 00000000..e75c5a1e
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/correct/util/error-sim.h
@@ -0,0 +1,47 @@
+#include <stdbool.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+size_t distance(uint8_t *a, uint8_t *b, size_t len);
+void gaussian(double *res, size_t n_res, double sigma);
+
+void encode_bpsk(uint8_t *msg, double *voltages, size_t n_syms, double bpsk_voltage);
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits);
+void decode_bpsk(uint8_t *soft, uint8_t *msg, size_t n_syms);
+void decode_bpsk_soft(double *voltages, uint8_t *soft, size_t n_syms, double bpsk_voltage);
+double log2amp(double l);
+double amp2log(double a);
+double sigma_for_eb_n0(double eb_n0, double bpsk_bit_energy);
+void build_white_noise(double *noise, size_t n_syms, double eb_n0, double bpsk_bit_energy);
+void add_white_noise(double *signal, double *noise, size_t n_syms);
+
+typedef struct {
+    uint8_t *msg_out;
+    size_t msg_len;
+    uint8_t *encoded;
+    double *v;
+    double *corrupted;
+    uint8_t *soft;
+    double *noise;
+    size_t enclen;
+    size_t enclen_bytes;
+    void (*encode)(void *, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+    void *encoder;
+    ssize_t (*decode)(void *, uint8_t *soft, size_t soft_len, uint8_t *msg);
+    void *decoder;
+} conv_testbench;
+
+conv_testbench *resize_conv_testbench(conv_testbench *scratch, size_t (*enclen)(void *, size_t), void *enc, size_t msg_len);
+void free_scratch(conv_testbench *scratch);
+int test_conv_noise(conv_testbench *scratch, uint8_t *msg, size_t n_bytes,
+                    double bpsk_voltage);
+
+size_t conv_correct_enclen(void *conv_v, size_t msg_len);
+void conv_correct_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+ssize_t conv_correct_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
diff --git a/falcon9_decoder/src/libcorrect/fec_shim.c b/falcon9_decoder/src/libcorrect/fec_shim.c
new file mode 100644
index 00000000..146c60ce
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/fec_shim.c
@@ -0,0 +1,255 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "fec_shim.h"
+
+typedef struct {
+    correct_reed_solomon *rs;
+    unsigned int msg_length;
+    unsigned int block_length;
+    unsigned int num_roots;
+    uint8_t *msg_out;
+    unsigned int pad;
+    uint8_t *erasures;
+} reed_solomon_shim;
+
+void *init_rs_char(int symbol_size, int primitive_polynomial,
+                   int first_consecutive_root, int root_gap, int number_roots,
+                   unsigned int pad) {
+    if (symbol_size != 8) {
+        return NULL;
+    }
+
+    reed_solomon_shim *shim = malloc(sizeof(reed_solomon_shim));
+
+    shim->pad = pad;
+    shim->block_length = 255 - pad;
+    shim->num_roots = number_roots;
+    shim->msg_length = shim->block_length - number_roots;
+    shim->rs = correct_reed_solomon_create(primitive_polynomial,
+                                           first_consecutive_root, root_gap, number_roots);
+    shim->msg_out = malloc(shim->block_length);
+    shim->erasures = malloc(number_roots);
+
+    return shim;
+}
+
+void free_rs_char(void *rs) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    correct_reed_solomon_destroy(shim->rs);
+    free(shim->msg_out);
+    free(shim->erasures);
+    free(shim);
+}
+
+void encode_rs_char(void *rs, const unsigned char *msg, unsigned char *parity) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    correct_reed_solomon_encode(shim->rs, msg, shim->msg_length, shim->msg_out);
+    memcpy(parity, shim->msg_out + shim->msg_length, shim->num_roots);
+}
+
+void decode_rs_char(void *rs, unsigned char *block, int *erasure_locations,
+                    int num_erasures) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    for (int i = 0; i < num_erasures; i++) {
+        shim->erasures[i] = (uint8_t)(erasure_locations[i]) - shim->pad;
+    }
+    correct_reed_solomon_decode_with_erasures(shim->rs, block, shim->block_length,
+                                              shim->erasures, num_erasures,
+                                              block);
+}
+
+typedef struct {
+    correct_convolutional *conv;
+    unsigned int rate;
+    unsigned int order;
+    uint8_t *buf;
+    size_t buf_len;
+    uint8_t *read_iter;
+    uint8_t *write_iter;
+} convolutional_shim;
+
+static correct_convolutional_polynomial_t r12k7[] = {V27POLYA, V27POLYB};
+
+static correct_convolutional_polynomial_t r12k9[] = {V29POLYA, V29POLYB};
+
+static correct_convolutional_polynomial_t r13k9[] = {V39POLYA, V39POLYB,
+                                                     V39POLYC};
+
+static correct_convolutional_polynomial_t r16k15[] = {
+    V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF};
+
+/* Common methods */
+static void *create_viterbi(unsigned int num_decoded_bits, unsigned int rate,
+                            unsigned int order,
+                            correct_convolutional_polynomial_t *poly) {
+    convolutional_shim *shim = malloc(sizeof(convolutional_shim));
+
+    size_t num_decoded_bytes = (num_decoded_bits % 8)
+                                   ? (num_decoded_bits / 8 + 1)
+                                   : num_decoded_bits / 8;
+
+    shim->rate = rate;
+    shim->order = order;
+    shim->buf = malloc(num_decoded_bytes);
+    shim->buf_len = num_decoded_bytes;
+    shim->conv = correct_convolutional_create(rate, order, poly);
+    shim->read_iter = shim->buf;
+    shim->write_iter = shim->buf;
+
+    return shim;
+}
+
+static void delete_viterbi(void *vit) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+    free(shim->buf);
+    correct_convolutional_destroy(shim->conv);
+    free(shim);
+}
+
+static void init_viterbi(void *vit) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+    shim->read_iter = shim->buf;
+    shim->write_iter = shim->buf;
+}
+
+static void update_viterbi_blk(void *vit, const unsigned char *encoded_soft,
+                               unsigned int num_encoded_groups) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+
+    // don't overwrite our buffer
+    size_t rem = (shim->buf + shim->buf_len) - shim->write_iter;
+    size_t rem_bits = 8 * rem;
+    // this math isn't very clear
+    // here we sort of do the opposite of what liquid-dsp does
+    size_t n_write_bits = num_encoded_groups - (shim->order - 1);
+    if (n_write_bits > rem_bits) {
+        size_t reduction = n_write_bits - rem_bits;
+        num_encoded_groups -= reduction;
+        n_write_bits -= reduction;
+    }
+
+    // what if n_write_bits isn't a multiple of 8?
+    // libcorrect can't start and stop at arbitrary indices...
+    correct_convolutional_decode_soft(
+        shim->conv, encoded_soft, num_encoded_groups * shim->rate, shim->write_iter);
+    shim->write_iter += n_write_bits / 8;
+}
+
+static void chainback_viterbi(void *vit, unsigned char *decoded,
+                              unsigned int num_decoded_bits) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+
+    // num_decoded_bits not a multiple of 8?
+    // this is a similar problem to update_viterbi_blk
+    // although here we could actually resolve a non-multiple of 8
+    size_t rem = shim->write_iter - shim->read_iter;
+    size_t rem_bits = 8 * rem;
+
+    if (num_decoded_bits > rem_bits) {
+        num_decoded_bits = rem_bits;
+    }
+
+    size_t num_decoded_bytes = (num_decoded_bits % 8)
+                                   ? (num_decoded_bits / 8 + 1)
+                                   : num_decoded_bits / 8;
+    memcpy(decoded, shim->read_iter, num_decoded_bytes);
+
+    shim->read_iter += num_decoded_bytes;
+}
+
+/* Rate 1/2, k = 7 */
+void *create_viterbi27(int num_decoded_bits) {
+    return create_viterbi(num_decoded_bits, 2, 7, r12k7);
+}
+
+void delete_viterbi27(void *vit) { delete_viterbi(vit); }
+
+int init_viterbi27(void *vit, int _) {
+    init_viterbi(vit);
+    return 0;
+}
+
+int update_viterbi27_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+int chainback_viterbi27(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/2, k = 9 */
+void *create_viterbi29(int num_decoded_bits) {
+    return create_viterbi(num_decoded_bits, 2, 9, r12k9);
+}
+
+void delete_viterbi29(void *vit) { delete_viterbi(vit); }
+
+int init_viterbi29(void *vit, int _) {
+    init_viterbi(vit);
+    return 0;
+}
+
+int update_viterbi29_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+int chainback_viterbi29(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/3, k = 9 */
+void *create_viterbi39(int num_decoded_bits) {
+    return create_viterbi(num_decoded_bits, 3, 9, r13k9);
+}
+
+void delete_viterbi39(void *vit) { delete_viterbi(vit); }
+
+int init_viterbi39(void *vit, int _) {
+    init_viterbi(vit);
+    return 0;
+}
+
+int update_viterbi39_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+int chainback_viterbi39(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/6, k = 15 */
+void *create_viterbi615(int num_decoded_bits) {
+    return create_viterbi(num_decoded_bits, 6, 15, r16k15);
+}
+
+void delete_viterbi615(void *vit) { delete_viterbi(vit); }
+
+int init_viterbi615(void *vit, int _) {
+    init_viterbi(vit);
+    return 0;
+}
+
+int update_viterbi615_blk(void *vit, unsigned char *encoded_soft,
+                          int num_encoded_groups) {
+    update_viterbi_blk(vit, encoded_soft, num_encoded_groups);
+    return 0;
+}
+
+int chainback_viterbi615(void *vit, unsigned char *decoded,
+                         unsigned int num_decoded_bits, unsigned int _) {
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
diff --git a/falcon9_decoder/src/libcorrect/fec_shim.h b/falcon9_decoder/src/libcorrect/fec_shim.h
new file mode 100644
index 00000000..b06e463c
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/fec_shim.h
@@ -0,0 +1,74 @@
+#ifndef CORRECT_FEC_H
+#define CORRECT_FEC_H
+// libcorrect's libfec shim header
+// this is a partial implementation of libfec
+// header signatures derived from found usages of libfec -- some things may be different
+#include <correct.h>
+
+// Reed-Solomon
+void *init_rs_char(int symbol_size, int primitive_polynomial, int first_consecutive_root,
+                   int root_gap, int number_roots, unsigned int pad);
+void free_rs_char(void *rs);
+void encode_rs_char(void *rs, const unsigned char *msg, unsigned char *parity);
+void decode_rs_char(void *rs, unsigned char *block, int *erasure_locations, int num_erasures);
+
+// Convolutional Codes
+
+// Polynomials
+// These have been determined via find_conv_libfec_poly.c
+// We could just make up new ones, but we use libfec's here so that
+//   codes encoded by this library can be decoded by the original libfec
+//   and vice-versa
+#define V27POLYA 0155
+#define V27POLYB 0117
+
+#define V29POLYA 0657
+#define V29POLYB 0435
+
+#define V39POLYA 0755
+#define V39POLYB 0633
+#define V39POLYC 0447
+
+#define V615POLYA 042631
+#define V615POLYB 047245
+#define V615POLYC 056507
+#define V615POLYD 073363
+#define V615POLYE 077267
+#define V615POLYF 064537
+
+// Convolutional Methods
+void *create_viterbi27(int num_decoded_bits);
+int init_viterbi27(void *vit, int _mystery);
+int update_viterbi27_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi27(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi27(void *vit);
+
+void *create_viterbi29(int num_decoded_bits);
+int init_viterbi29(void *vit, int _mystery);
+int update_viterbi29_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi29(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi29(void *vit);
+
+void *create_viterbi39(int num_decoded_bits);
+int init_viterbi39(void *vit, int _mystery);
+int update_viterbi39_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi39(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi39(void *vit);
+
+void *create_viterbi615(int num_decoded_bits);
+int init_viterbi615(void *vit, int _mystery);
+int update_viterbi615_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi615(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi615(void *vit);
+
+// Misc other
+static inline int parity(unsigned int x) {
+    /* http://graphics.stanford.edu/~seander/bithacks.html#ParityParallel */
+    x ^= x >> 16;
+    x ^= x >> 8;
+    x ^= x >> 4;
+    x &= 0xf;
+    return (0x6996 >> x) & 1;
+}
+
+#endif
diff --git a/falcon9_decoder/src/libcorrect/reed-solomon/CMakeLists.txt b/falcon9_decoder/src/libcorrect/reed-solomon/CMakeLists.txt
new file mode 100644
index 00000000..eabe75e5
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/reed-solomon/CMakeLists.txt
@@ -0,0 +1,2 @@
+set(SRCFILES polynomial.c reed-solomon.c encode.c decode.c)
+add_library(correct-reed-solomon OBJECT ${SRCFILES})
diff --git a/falcon9_decoder/src/libcorrect/reed-solomon/decode.c b/falcon9_decoder/src/libcorrect/reed-solomon/decode.c
new file mode 100644
index 00000000..ea30b522
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/reed-solomon/decode.c
@@ -0,0 +1,508 @@
+#include "correct/reed-solomon/encode.h"
+
+// calculate all syndromes of the received polynomial at the roots of the generator
+// because we're evaluating at the roots of the generator, and because the transmitted
+//   polynomial was made to be a product of the generator, we know that the transmitted
+//   polynomial is 0 at these roots
+// any nonzero syndromes we find here are the values of the error polynomial evaluated
+//   at these roots, so these values give us a window into the error polynomial. if
+//   these syndromes are all zero, then we can conclude the error polynomial is also
+//   zero. if they're nonzero, then we know our message received an error in transit.
+// returns true if syndromes are all zero
+static bool reed_solomon_find_syndromes(field_t field, polynomial_t msgpoly, field_logarithm_t **generator_root_exp,
+                                        field_element_t *syndromes, size_t min_distance) {
+    bool all_zero = true;
+    memset(syndromes, 0, min_distance * sizeof(field_element_t));
+    for (unsigned int i = 0; i < min_distance; i++) {
+        // profiling reveals that this function takes about 50% of the cpu time of
+        // decoding. so, in order to speed it up a little, we precompute and save
+        // the successive powers of the roots of the generator, which are
+        // located in generator_root_exp
+        field_element_t eval = polynomial_eval_lut(field, msgpoly, generator_root_exp[i]);
+        if (eval) {
+            all_zero = false;
+        }
+        syndromes[i] = eval;
+    }
+    return all_zero;
+}
+
+// Berlekamp-Massey algorithm to find LFSR that describes syndromes
+// returns number of errors and writes the error locator polynomial to rs->error_locator
+static unsigned int reed_solomon_find_error_locator(correct_reed_solomon *rs, size_t num_erasures) {
+    unsigned int numerrors = 0;
+
+    memset(rs->error_locator.coeff, 0, (rs->min_distance + 1) * sizeof(field_element_t));
+
+    // initialize to f(x) = 1
+    rs->error_locator.coeff[0] = 1;
+    rs->error_locator.order = 0;
+
+    memcpy(rs->last_error_locator.coeff, rs->error_locator.coeff, (rs->min_distance + 1) * sizeof(field_element_t));
+    rs->last_error_locator.order = rs->error_locator.order;
+
+    field_element_t discrepancy;
+    field_element_t last_discrepancy = 1;
+    unsigned int delay_length = 1;
+
+    for (unsigned int i = rs->error_locator.order; i < rs->min_distance - num_erasures; i++) {
+        discrepancy = rs->syndromes[i];
+        for (unsigned int j = 1; j <= numerrors; j++) {
+            discrepancy = field_add(rs->field, discrepancy,
+                                    field_mul(rs->field, rs->error_locator.coeff[j], rs->syndromes[i - j]));
+        }
+
+        if (!discrepancy) {
+            // our existing LFSR describes the new syndrome as well
+            // leave it as-is but update the number of delay elements
+            //   so that if a discrepancy occurs later we can eliminate it
+            delay_length++;
+            continue;
+        }
+
+        if (2 * numerrors <= i) {
+            // there's a discrepancy, but we still have room for more taps
+            // lengthen LFSR by one tap and set weight to eliminate discrepancy
+
+            // shift the last locator by the delay length, multiply by discrepancy,
+            //   and divide by the last discrepancy
+            // we move down because we're shifting up, and this prevents overwriting
+            for (int j = rs->last_error_locator.order; j >= 0; j--) {
+                // the bounds here will be ok since we have a headroom of numerrors
+                rs->last_error_locator.coeff[j + delay_length] = field_div(
+                    rs->field, field_mul(rs->field, rs->last_error_locator.coeff[j], discrepancy), last_discrepancy);
+            }
+            for (int j = delay_length - 1; j >= 0; j--) {
+                rs->last_error_locator.coeff[j] = 0;
+            }
+
+            // locator = locator - last_locator
+            // we will also update last_locator to be locator before this loop takes place
+            field_element_t temp;
+            for (int j = 0; j <= (rs->last_error_locator.order + delay_length); j++) {
+                temp = rs->error_locator.coeff[j];
+                rs->error_locator.coeff[j] =
+                    field_add(rs->field, rs->error_locator.coeff[j], rs->last_error_locator.coeff[j]);
+                rs->last_error_locator.coeff[j] = temp;
+            }
+            unsigned int temp_order = rs->error_locator.order;
+            rs->error_locator.order = rs->last_error_locator.order + delay_length;
+            rs->last_error_locator.order = temp_order;
+
+            // now last_locator is locator before we started,
+            //   and locator is (locator - (discrepancy/last_discrepancy) * x^(delay_length) * last_locator)
+
+            numerrors = i + 1 - numerrors;
+            last_discrepancy = discrepancy;
+            delay_length = 1;
+            continue;
+        }
+
+        // no more taps
+        // unlike the previous case, we are preserving last locator,
+        //    but we'll update locator as before
+        // we're basically flattening the two loops from the previous case because
+        //    we no longer need to update last_locator
+        for (int j = rs->last_error_locator.order; j >= 0; j--) {
+            rs->error_locator.coeff[j + delay_length] =
+                field_add(rs->field, rs->error_locator.coeff[j + delay_length],
+                          field_div(rs->field, field_mul(rs->field, rs->last_error_locator.coeff[j], discrepancy),
+                                    last_discrepancy));
+        }
+        rs->error_locator.order = (rs->last_error_locator.order + delay_length > rs->error_locator.order)
+                                      ? rs->last_error_locator.order + delay_length
+                                      : rs->error_locator.order;
+        delay_length++;
+    }
+    return rs->error_locator.order;
+}
+
+// find the roots of the error locator polynomial
+// Chien search
+bool reed_solomon_factorize_error_locator(field_t field, unsigned int num_skip, polynomial_t locator_log, field_element_t *roots,
+                                          field_logarithm_t **element_exp) {
+    // normally it'd be tricky to find all the roots
+    // but, the finite field is awfully finite...
+    // just brute force search across every field element
+    unsigned int root = num_skip;
+    memset(roots + num_skip, 0, (locator_log.order) * sizeof(field_element_t));
+    for (field_operation_t i = 0; i < 256; i++) {
+        // we make two optimizations here to help this search go faster
+        // a) we have precomputed the first successive powers of every single element
+        //   in the field. we need at most n powers, where n is the largest possible
+        //   degree of the error locator
+        // b) we have precomputed the error locator polynomial in log form, which
+        //   helps reduce some lookups that would be done here
+        if (!polynomial_eval_log_lut(field, locator_log, element_exp[i])) {
+            roots[root] = (field_element_t)i;
+            root++;
+        }
+    }
+    // this is where we find out if we are have too many errors to recover from
+    // berlekamp-massey may have built an error locator that has 0 discrepancy
+    // on the syndromes but doesn't have enough roots
+    return root == locator_log.order + num_skip;
+}
+
+// use error locator and syndromes to find the error evaluator polynomial
+void reed_solomon_find_error_evaluator(field_t field, polynomial_t locator, polynomial_t syndromes,
+                                       polynomial_t error_evaluator) {
+    // the error evaluator, omega(x), is S(x)*Lamba(x) mod x^(2t)
+    // where S(x) is a polynomial constructed from the syndromes
+    //   S(1) + S(2)*x + ... + S(2t)*x(2t - 1)
+    // and Lambda(x) is the error locator
+    // the modulo is implicit here -- we have limited the max length of error_evaluator,
+    //   which polynomial_mul will interpret to mean that it should not compute
+    //   powers larger than that, which is the same as performing mod x^(2t)
+    polynomial_mul(field, locator, syndromes, error_evaluator);
+}
+
+// use error locator, error roots and syndromes to find the error values
+// that is, the elements in the finite field which can be added to the received
+//   polynomial at the locations of the error roots in order to produce the
+//   transmitted polynomial
+// forney algorithm
+void reed_solomon_find_error_values(correct_reed_solomon *rs) {
+    // error value e(j) = -(X(j)^(1-c) * omega(X(j)^-1))/(lambda'(X(j)^-1))
+    // where X(j)^-1 is a root of the error locator, omega(X) is the error evaluator,
+    //   lambda'(X) is the first formal derivative of the error locator,
+    //   and c is the first consecutive root of the generator used in encoding
+
+    // first find omega(X), the error evaluator
+    // we generate S(x), the polynomial constructed from the roots of the syndromes
+    // this is *not* the polynomial constructed by expanding the products of roots
+    // S(x) = S(1) + S(2)*x + ... + S(2t)*x(2t - 1)
+    polynomial_t syndrome_poly;
+    syndrome_poly.order = rs->min_distance - 1;
+    syndrome_poly.coeff = rs->syndromes;
+    memset(rs->error_evaluator.coeff, 0, (rs->error_evaluator.order + 1) * sizeof(field_element_t));
+    reed_solomon_find_error_evaluator(rs->field, rs->error_locator, syndrome_poly, rs->error_evaluator);
+
+    // now find lambda'(X)
+    rs->error_locator_derivative.order = rs->error_locator.order - 1;
+    polynomial_formal_derivative(rs->field, rs->error_locator, rs->error_locator_derivative);
+
+    // calculate each e(j)
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        if (rs->error_roots[i] == 0) {
+            continue;
+        }
+        rs->error_vals[i] = field_mul(
+            rs->field, field_pow(rs->field, rs->error_roots[i], rs->first_consecutive_root - 1),
+            field_div(
+                rs->field, polynomial_eval_lut(rs->field, rs->error_evaluator, rs->element_exp[rs->error_roots[i]]),
+                polynomial_eval_lut(rs->field, rs->error_locator_derivative, rs->element_exp[rs->error_roots[i]])));
+    }
+}
+
+void reed_solomon_find_error_locations(field_t field, field_logarithm_t generator_root_gap,
+                                       field_element_t *error_roots, field_logarithm_t *error_locations,
+                                       unsigned int num_errors, unsigned int num_skip) {
+    for (unsigned int i = 0; i < num_errors; i++) {
+        // the error roots are the reciprocals of the error locations, so div 1 by them
+
+        // we do mod 255 here because the log table aliases at index 1
+        // the log of 1 is both 0 and 255 (alpha^255 = alpha^0 = 1)
+        // for most uses it makes sense to have log(1) = 255, but in this case
+        // we're interested in a byte index, and the 255th index is not even valid
+        // just wrap it back to 0
+
+        if (error_roots[i] == 0) {
+            continue;
+        }
+
+        field_operation_t loc = field_div(field, 1, error_roots[i]);
+        for (field_operation_t j = 0; j < 256; j++) {
+            if (field_pow(field, j, generator_root_gap) == loc) {
+                error_locations[i] = field.log[j];
+                break;
+            }
+        }
+    }
+}
+
+// erasure method -- take given locations and convert to roots
+// this is the inverse of reed_solomon_find_error_locations
+static void reed_solomon_find_error_roots_from_locations(field_t field, field_logarithm_t generator_root_gap,
+                                                         const field_logarithm_t *error_locations,
+                                                         field_element_t *error_roots, unsigned int num_errors) {
+    for (unsigned int i = 0; i < num_errors; i++) {
+        field_element_t loc = field_pow(field, field.exp[error_locations[i]], generator_root_gap);
+        // field_element_t loc = field.exp[error_locations[i]];
+        error_roots[i] = field_div(field, 1, loc);
+        // error_roots[i] = loc;
+    }
+}
+
+// erasure method -- given the roots of the error locator, create the polynomial
+static polynomial_t reed_solomon_find_error_locator_from_roots(field_t field, unsigned int num_errors,
+                                                               field_element_t *error_roots,
+                                                               polynomial_t error_locator,
+                                                               polynomial_t *scratch) {
+    // multiply out roots to build the error locator polynomial
+    return polynomial_init_from_roots(field, num_errors, error_roots, error_locator, scratch);
+}
+
+// erasure method
+static void reed_solomon_find_modified_syndromes(correct_reed_solomon *rs, field_element_t *syndromes, polynomial_t error_locator, field_element_t *modified_syndromes) {
+    polynomial_t syndrome_poly;
+    syndrome_poly.order = rs->min_distance - 1;
+    syndrome_poly.coeff = syndromes;
+
+    polynomial_t modified_syndrome_poly;
+    modified_syndrome_poly.order = rs->min_distance - 1;
+    modified_syndrome_poly.coeff = modified_syndromes;
+
+    polynomial_mul(rs->field, error_locator, syndrome_poly, modified_syndrome_poly);
+}
+
+void correct_reed_solomon_decoder_create(correct_reed_solomon *rs) {
+    rs->has_init_decode = true;
+    rs->syndromes = calloc(rs->min_distance, sizeof(field_element_t));
+    rs->modified_syndromes = calloc(2 * rs->min_distance, sizeof(field_element_t));
+    rs->received_polynomial = polynomial_create(rs->block_length - 1);
+    rs->error_locator = polynomial_create(rs->min_distance);
+    rs->error_locator_log = polynomial_create(rs->min_distance);
+    rs->erasure_locator = polynomial_create(rs->min_distance);
+    rs->error_roots = calloc(2 * rs->min_distance, sizeof(field_element_t));
+    rs->error_vals = malloc(rs->min_distance * sizeof(field_element_t));
+    rs->error_locations = malloc(rs->min_distance * sizeof(field_logarithm_t));
+
+    rs->last_error_locator = polynomial_create(rs->min_distance);
+    rs->error_evaluator = polynomial_create(rs->min_distance - 1);
+    rs->error_locator_derivative = polynomial_create(rs->min_distance - 1);
+
+    // calculate and store the first block_length powers of every generator root
+    // we would have to do this work in order to calculate the syndromes
+    // if we save it, we can prevent the need to recalculate it on subsequent calls
+    // total memory usage is min_distance * block_length bytes e.g. 32 * 255 ~= 8k
+    rs->generator_root_exp = malloc(rs->min_distance * sizeof(field_logarithm_t *));
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        rs->generator_root_exp[i] = malloc(rs->block_length * sizeof(field_logarithm_t));
+        polynomial_build_exp_lut(rs->field, rs->generator_roots[i], rs->block_length - 1, rs->generator_root_exp[i]);
+    }
+
+    // calculate and store the first min_distance powers of every element in the field
+    // we would have to do this for chien search anyway, and its size is only 256 * min_distance bytes
+    // for min_distance = 32 this is 8k of memory, a pittance for the speedup we receive in exchange
+    // we also get to reuse this work during error value calculation
+    rs->element_exp = malloc(256 * sizeof(field_logarithm_t *));
+    for (field_operation_t i = 0; i < 256; i++) {
+        rs->element_exp[i] = malloc(rs->min_distance * sizeof(field_logarithm_t));
+        polynomial_build_exp_lut(rs->field, i, rs->min_distance - 1, rs->element_exp[i]);
+    }
+
+    rs->init_from_roots_scratch[0] = polynomial_create(rs->min_distance);
+    rs->init_from_roots_scratch[1] = polynomial_create(rs->min_distance);
+}
+
+ssize_t correct_reed_solomon_decode(correct_reed_solomon *rs, const uint8_t *encoded, size_t encoded_length,
+                                    uint8_t *msg) {
+    if (encoded_length > rs->block_length) {
+        return -1;
+    }
+
+    // the message is the non-remainder part
+    size_t msg_length = encoded_length - rs->min_distance;
+    // if they handed us a nonfull block, we'll write in 0s
+    size_t pad_length = rs->block_length - encoded_length;
+
+    if (!rs->has_init_decode) {
+        // initialize rs for decoding
+        correct_reed_solomon_decoder_create(rs);
+    }
+
+    // we need to copy to our local buffer
+    // the buffer we're given has the coordinates in the wrong direction
+    // e.g. byte 0 corresponds to the 254th order coefficient
+    // so we're going to flip and then write padding
+    // the final copied buffer will look like
+    // | rem (rs->min_distance) | msg (msg_length) | pad (pad_length) |
+
+    for (unsigned int i = 0; i < encoded_length; i++) {
+        rs->received_polynomial.coeff[i] = encoded[encoded_length - (i + 1)];
+    }
+
+    // fill the pad_length with 0s
+    for (unsigned int i = 0; i < pad_length; i++) {
+        rs->received_polynomial.coeff[i + encoded_length] = 0;
+    }
+
+
+    bool all_zero = reed_solomon_find_syndromes(rs->field, rs->received_polynomial, rs->generator_root_exp,
+                                                rs->syndromes, rs->min_distance);
+
+    if (all_zero) {
+        // syndromes were all zero, so there was no error in the message
+        // copy to msg and we are done
+        for (unsigned int i = 0; i < msg_length; i++) {
+            msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+        }
+        return msg_length;
+    }
+
+    unsigned int order = reed_solomon_find_error_locator(rs, 0);
+    // XXX fix this vvvv
+    rs->error_locator.order = order;
+
+    for (unsigned int i = 0; i <= rs->error_locator.order; i++) {
+        // this is a little strange since the coeffs are logs, not elements
+        // also, we'll be storing log(0) = 0 for any 0 coeffs in the error locator
+        // that would seem bad but we'll just be using this in chien search, and we'll skip all 0 coeffs
+        // (you might point out that log(1) also = 0, which would seem to alias. however, that's ok,
+        //   because log(1) = 255 as well, and in fact that's how it's represented in our log table)
+        rs->error_locator_log.coeff[i] = rs->field.log[rs->error_locator.coeff[i]];
+    }
+    rs->error_locator_log.order = rs->error_locator.order;
+
+    if (!reed_solomon_factorize_error_locator(rs->field, 0, rs->error_locator_log, rs->error_roots, rs->element_exp)) {
+        // roots couldn't be found, so there were too many errors to deal with
+        // RS has failed for this message
+        return -1;
+    }
+
+    reed_solomon_find_error_locations(rs->field, rs->generator_root_gap, rs->error_roots, rs->error_locations,
+                                      rs->error_locator.order, 0);
+
+    reed_solomon_find_error_values(rs);
+
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        rs->received_polynomial.coeff[rs->error_locations[i]] =
+            field_sub(rs->field, rs->received_polynomial.coeff[rs->error_locations[i]], rs->error_vals[i]);
+    }
+
+    for (unsigned int i = 0; i < msg_length; i++) {
+        msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+    }
+
+    return msg_length;
+}
+
+ssize_t correct_reed_solomon_decode_with_erasures(correct_reed_solomon *rs, const uint8_t *encoded,
+                                                  size_t encoded_length, const uint8_t *erasure_locations,
+                                                  size_t erasure_length, uint8_t *msg) {
+    if (!erasure_length) {
+        return correct_reed_solomon_decode(rs, encoded, encoded_length, msg);
+    }
+
+    if (encoded_length > rs->block_length) {
+        return -1;
+    }
+
+    if (erasure_length > rs->min_distance) {
+        return -1;
+    }
+
+    // the message is the non-remainder part
+    size_t msg_length = encoded_length - rs->min_distance;
+    // if they handed us a nonfull block, we'll write in 0s
+    size_t pad_length = rs->block_length - encoded_length;
+
+    if (!rs->has_init_decode) {
+        // initialize rs for decoding
+        correct_reed_solomon_decoder_create(rs);
+    }
+
+    // we need to copy to our local buffer
+    // the buffer we're given has the coordinates in the wrong direction
+    // e.g. byte 0 corresponds to the 254th order coefficient
+    // so we're going to flip and then write padding
+    // the final copied buffer will look like
+    // | rem (rs->min_distance) | msg (msg_length) | pad (pad_length) |
+
+    for (unsigned int i = 0; i < encoded_length; i++) {
+        rs->received_polynomial.coeff[i] = encoded[encoded_length - (i + 1)];
+    }
+
+    // fill the pad_length with 0s
+    for (unsigned int i = 0; i < pad_length; i++) {
+        rs->received_polynomial.coeff[i + encoded_length] = 0;
+    }
+
+    for (unsigned int i = 0; i < erasure_length; i++) {
+        // remap the coordinates of the erasures
+        rs->error_locations[i] = rs->block_length - (erasure_locations[i] + pad_length + 1);
+    }
+
+    reed_solomon_find_error_roots_from_locations(rs->field, rs->generator_root_gap, rs->error_locations,
+                                                 rs->error_roots, erasure_length);
+
+    rs->erasure_locator =
+        reed_solomon_find_error_locator_from_roots(rs->field, erasure_length, rs->error_roots, rs->erasure_locator, rs->init_from_roots_scratch);
+
+    bool all_zero = reed_solomon_find_syndromes(rs->field, rs->received_polynomial, rs->generator_root_exp,
+                                                rs->syndromes, rs->min_distance);
+
+    if (all_zero) {
+        // syndromes were all zero, so there was no error in the message
+        // copy to msg and we are done
+        for (unsigned int i = 0; i < msg_length; i++) {
+            msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+        }
+        return msg_length;
+    }
+
+    reed_solomon_find_modified_syndromes(rs, rs->syndromes, rs->erasure_locator, rs->modified_syndromes);
+
+    field_element_t *syndrome_copy = malloc(rs->min_distance * sizeof(field_element_t));
+    memcpy(syndrome_copy, rs->syndromes, rs->min_distance * sizeof(field_element_t));
+
+    for (unsigned int i = erasure_length; i < rs->min_distance; i++) {
+        rs->syndromes[i - erasure_length] = rs->modified_syndromes[i];
+    }
+
+    unsigned int order = reed_solomon_find_error_locator(rs, erasure_length);
+    // XXX fix this vvvv
+    rs->error_locator.order = order;
+
+    for (unsigned int i = 0; i <= rs->error_locator.order; i++) {
+        // this is a little strange since the coeffs are logs, not elements
+        // also, we'll be storing log(0) = 0 for any 0 coeffs in the error locator
+        // that would seem bad but we'll just be using this in chien search, and we'll skip all 0 coeffs
+        // (you might point out that log(1) also = 0, which would seem to alias. however, that's ok,
+        //   because log(1) = 255 as well, and in fact that's how it's represented in our log table)
+        rs->error_locator_log.coeff[i] = rs->field.log[rs->error_locator.coeff[i]];
+    }
+    rs->error_locator_log.order = rs->error_locator.order;
+
+    /*
+    for (unsigned int i = 0; i < erasure_length; i++) {
+        rs->error_roots[i] = field_div(rs->field, 1, rs->error_roots[i]);
+    }
+    */
+
+    if (!reed_solomon_factorize_error_locator(rs->field, erasure_length, rs->error_locator_log, rs->error_roots, rs->element_exp)) {
+        // roots couldn't be found, so there were too many errors to deal with
+        // RS has failed for this message
+        free(syndrome_copy);
+        return -1;
+    }
+
+    polynomial_t temp_poly = polynomial_create(rs->error_locator.order + erasure_length);
+    polynomial_mul(rs->field, rs->erasure_locator, rs->error_locator, temp_poly);
+    polynomial_t placeholder_poly = rs->error_locator;
+    rs->error_locator = temp_poly;
+
+    reed_solomon_find_error_locations(rs->field, rs->generator_root_gap, rs->error_roots, rs->error_locations,
+                                      rs->error_locator.order, erasure_length);
+
+    memcpy(rs->syndromes, syndrome_copy, rs->min_distance * sizeof(field_element_t));
+
+    reed_solomon_find_error_values(rs);
+
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        rs->received_polynomial.coeff[rs->error_locations[i]] =
+            field_sub(rs->field, rs->received_polynomial.coeff[rs->error_locations[i]], rs->error_vals[i]);
+    }
+
+    rs->error_locator = placeholder_poly;
+
+    for (unsigned int i = 0; i < msg_length; i++) {
+        msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+    }
+
+    polynomial_destroy(temp_poly);
+    free(syndrome_copy);
+
+    return msg_length;
+}
diff --git a/falcon9_decoder/src/libcorrect/reed-solomon/encode.c b/falcon9_decoder/src/libcorrect/reed-solomon/encode.c
new file mode 100644
index 00000000..d4eb6f3a
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/reed-solomon/encode.c
@@ -0,0 +1,34 @@
+#include "correct/reed-solomon/encode.h"
+
+ssize_t correct_reed_solomon_encode(correct_reed_solomon *rs, const uint8_t *msg, size_t msg_length, uint8_t *encoded) {
+    if (msg_length > rs->message_length) {
+        return -1;
+    }
+
+    size_t pad_length = rs->message_length - msg_length;
+    for (unsigned int i = 0; i < msg_length; i++) {
+        // message goes from high order to low order but libcorrect polynomials go low to high
+        // so we reverse on the way in and on the way out
+        // we'd have to do a copy anyway so this reversal should be free
+        rs->encoded_polynomial.coeff[rs->encoded_polynomial.order - (i + pad_length)] = msg[i];
+    }
+
+    // 0-fill the rest of the coefficients -- this length will always be > 0
+    // because the order of this poly is block_length and the msg_length <= message_length
+    // e.g. 255 and 223
+    memset(rs->encoded_polynomial.coeff + (rs->encoded_polynomial.order + 1 - pad_length), 0, pad_length);
+    memset(rs->encoded_polynomial.coeff, 0, (rs->encoded_polynomial.order + 1 - rs->message_length));
+
+    polynomial_mod(rs->field, rs->encoded_polynomial, rs->generator, rs->encoded_remainder);
+
+    // now return byte order to highest order to lowest order
+    for (unsigned int i = 0; i < msg_length; i++) {
+        encoded[i] = rs->encoded_polynomial.coeff[rs->encoded_polynomial.order - (i + pad_length)];
+    }
+
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        encoded[msg_length + i] = rs->encoded_remainder.coeff[rs->min_distance - (i + 1)];
+    }
+
+    return rs->block_length;
+}
diff --git a/falcon9_decoder/src/libcorrect/reed-solomon/polynomial.c b/falcon9_decoder/src/libcorrect/reed-solomon/polynomial.c
new file mode 100644
index 00000000..32c2792a
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/reed-solomon/polynomial.c
@@ -0,0 +1,255 @@
+#include "correct/reed-solomon/polynomial.h"
+
+polynomial_t polynomial_create(unsigned int order) {
+    polynomial_t polynomial;
+    polynomial.coeff = malloc(sizeof(field_element_t) * (order + 1));
+    polynomial.order = order;
+    return polynomial;
+}
+
+void polynomial_destroy(polynomial_t polynomial) {
+    free(polynomial.coeff);
+}
+
+// if you want a full multiplication, then make res.order = l.order + r.order
+// but if you just care about a lower order, e.g. mul mod x^i, then you can select
+//    fewer coefficients
+void polynomial_mul(field_t field, polynomial_t l, polynomial_t r, polynomial_t res) {
+    // perform an element-wise multiplication of two polynomials
+    memset(res.coeff, 0, sizeof(field_element_t) * (res.order + 1));
+    for (unsigned int i = 0; i <= l.order; i++) {
+        if (i > res.order) {
+            continue;
+        }
+        unsigned int j_limit = (r.order > res.order - i) ? res.order - i : r.order;
+        for (unsigned int j = 0; j <= j_limit; j++) {
+            // e.g. alpha^5*x * alpha^37*x^2 --> alpha^42*x^3
+            res.coeff[i + j] = field_add(field, res.coeff[i + j], field_mul(field, l.coeff[i], r.coeff[j]));
+        }
+    }
+}
+
+void polynomial_mod(field_t field, polynomial_t dividend, polynomial_t divisor, polynomial_t mod) {
+    // find the polynomial remainder of dividend mod divisor
+    // do long division and return just the remainder (written to mod)
+
+    if (mod.order < dividend.order) {
+        // mod.order must be >= dividend.order (scratch space needed)
+        // this is an error -- catch it in debug?
+        return;
+    }
+    // initialize remainder as dividend
+    memcpy(mod.coeff, dividend.coeff, sizeof(field_element_t) * (dividend.order + 1));
+
+    // XXX make sure divisor[divisor_order] is nonzero
+    field_logarithm_t divisor_leading = field.log[divisor.coeff[divisor.order]];
+    // long division steps along one order at a time, starting at the highest order
+    for (unsigned int i = dividend.order; i > 0; i--) {
+        // look at the leading coefficient of dividend and divisor
+        // if leading coefficient of dividend / leading coefficient of divisor is q
+        //   then the next row of subtraction will be q * divisor
+        // if order of q < 0 then what we have is the remainder and we are done
+        if (i < divisor.order) {
+            break;
+        }
+        if (mod.coeff[i] == 0) {
+            continue;
+        }
+        unsigned int q_order = i - divisor.order;
+        field_logarithm_t q_coeff = field_div_log(field, field.log[mod.coeff[i]], divisor_leading);
+
+        // now that we've chosen q, multiply the divisor by q and subtract from
+        //   our remainder. subtracting in GF(2^8) is XOR, just like addition
+        for (unsigned int j = 0; j <= divisor.order; j++) {
+            if (divisor.coeff[j] == 0) {
+                continue;
+            }
+            // all of the multiplication is shifted up by q_order places
+            mod.coeff[j + q_order] = field_add(field, mod.coeff[j + q_order],
+                        field_mul_log_element(field, field.log[divisor.coeff[j]], q_coeff));
+        }
+    }
+}
+
+void polynomial_formal_derivative(field_t field, polynomial_t poly, polynomial_t der) {
+    // if f(x) = a(n)*x^n + ... + a(1)*x + a(0)
+    // then f'(x) = n*a(n)*x^(n-1) + ... + 2*a(2)*x + a(1)
+    // where n*a(n) = sum(k=1, n, a(n)) e.g. the nth sum of a(n) in GF(2^8)
+
+    // assumes der.order = poly.order - 1
+    memset(der.coeff, 0, sizeof(field_element_t) * (der.order + 1));
+    for (unsigned int i = 0; i <= der.order; i++) {
+        // we're filling in the ith power of der, so we look ahead one power in poly
+        // f(x) = a(i + 1)*x^(i + 1) -> f'(x) = (i + 1)*a(i + 1)*x^i
+        // where (i + 1)*a(i + 1) is the sum of a(i + 1) (i + 1) times, not the product
+        der.coeff[i] = field_sum(field, poly.coeff[i + 1], i + 1);
+    }
+}
+
+field_element_t polynomial_eval(field_t field, polynomial_t poly, field_element_t val) {
+    // evaluate the polynomial poly at a particular element val
+    if (val == 0) {
+        return poly.coeff[0];
+    }
+
+    field_element_t res = 0;
+
+    // we're going to start at 0th order and multiply by val each time
+    field_logarithm_t val_exponentiated = field.log[1];
+    field_logarithm_t val_log = field.log[val];
+
+    for (unsigned int i = 0; i <= poly.order; i++) {
+        if (poly.coeff[i] != 0) {
+            // multiply-accumulate by the next coeff times the next power of val
+            res = field_add(field, res,
+                    field_mul_log_element(field, field.log[poly.coeff[i]], val_exponentiated));
+        }
+        // now advance to the next power
+        val_exponentiated = field_mul_log(field, val_exponentiated, val_log);
+    }
+    return res;
+}
+
+field_element_t polynomial_eval_lut(field_t field, polynomial_t poly, const field_logarithm_t *val_exp) {
+    // evaluate the polynomial poly at a particular element val
+    // in this case, all of the logarithms of the successive powers of val have been precalculated
+    // this removes the extra work we'd have to do to calculate val_exponentiated each time
+    //   if this function is to be called on the same val multiple times
+    if (val_exp[0] == 0) {
+        return poly.coeff[0];
+    }
+
+    field_element_t res = 0;
+
+    for (unsigned int i = 0; i <= poly.order; i++) {
+        if (poly.coeff[i] != 0) {
+            // multiply-accumulate by the next coeff times the next power of val
+            res = field_add(field, res,
+                    field_mul_log_element(field, field.log[poly.coeff[i]], val_exp[i]));
+        }
+    }
+    return res;
+}
+
+field_element_t polynomial_eval_log_lut(field_t field, polynomial_t poly_log, const field_logarithm_t *val_exp) {
+    // evaluate the log_polynomial poly at a particular element val
+    // like polynomial_eval_lut, the logarithms of the successive powers of val have been
+    //   precomputed
+    if (val_exp[0] == 0) {
+        if (poly_log.coeff[0] == 0) {
+            // special case for the non-existant log case
+            return 0;
+        }
+        return field.exp[poly_log.coeff[0]];
+    }
+
+    field_element_t res = 0;
+
+    for (unsigned int i = 0; i <= poly_log.order; i++) {
+        // using 0 as a sentinel value in log -- log(0) is really -inf
+        if (poly_log.coeff[i] != 0) {
+            // multiply-accumulate by the next coeff times the next power of val
+            res = field_add(field, res,
+                    field_mul_log_element(field, poly_log.coeff[i], val_exp[i]));
+        }
+    }
+    return res;
+}
+
+void polynomial_build_exp_lut(field_t field, field_element_t val, unsigned int order, field_logarithm_t *val_exp) {
+    // create the lookup table of successive powers of val used by polynomial_eval_lut
+    field_logarithm_t val_exponentiated = field.log[1];
+    field_logarithm_t val_log = field.log[val];
+    for (unsigned int i = 0; i <= order; i++) {
+        if (val == 0) {
+            val_exp[i] = 0;
+        } else {
+            val_exp[i] = val_exponentiated;
+            val_exponentiated = field_mul_log(field, val_exponentiated, val_log);
+        }
+    }
+}
+
+polynomial_t polynomial_init_from_roots(field_t field, unsigned int nroots, field_element_t *roots, polynomial_t poly, polynomial_t *scratch) {
+    unsigned int order = nroots;
+    polynomial_t l;
+    field_element_t l_coeff[2];
+    l.order = 1;
+    l.coeff = l_coeff;
+
+    // we'll keep two temporary stores of rightside polynomial
+    // each time through the loop, we take the previous result and use it as new rightside
+    // swap back and forth (prevents the need for a copy)
+    polynomial_t r[2];
+    r[0] = scratch[0];
+    r[1] = scratch[1];
+    unsigned int rcoeffres = 0;
+
+    // initialize the result with x + roots[0]
+    r[rcoeffres].coeff[1] = 1;
+    r[rcoeffres].coeff[0] = roots[0];
+    r[rcoeffres].order = 1;
+
+    // initialize lcoeff[1] with x
+    // we'll fill in the 0th order term in each loop iter
+    l.coeff[1] = 1;
+
+    // loop through, using previous run's result as the new right hand side
+    // this allows us to multiply one group at a time
+    for (unsigned int i = 1; i < nroots; i++) {
+        l.coeff[0] = roots[i];
+        unsigned int nextrcoeff = rcoeffres;
+        rcoeffres = (rcoeffres + 1) % 2;
+        r[rcoeffres].order = i + 1;
+        polynomial_mul(field, l, r[nextrcoeff], r[rcoeffres]);
+    }
+
+    memcpy(poly.coeff, r[rcoeffres].coeff, (order + 1) * sizeof(field_element_t));
+    poly.order = order;
+
+    return poly;
+}
+
+polynomial_t polynomial_create_from_roots(field_t field, unsigned int nroots, field_element_t *roots) {
+    polynomial_t poly = polynomial_create(nroots);
+    unsigned int order = nroots;
+    polynomial_t l;
+    l.order = 1;
+    l.coeff = calloc(2, sizeof(field_element_t));
+
+    polynomial_t r[2];
+    // we'll keep two temporary stores of rightside polynomial
+    // each time through the loop, we take the previous result and use it as new rightside
+    // swap back and forth (prevents the need for a copy)
+    r[0].coeff = calloc(order + 1, sizeof(field_element_t));
+    r[1].coeff = calloc(order + 1, sizeof(field_element_t));
+    unsigned int rcoeffres = 0;
+
+    // initialize the result with x + roots[0]
+    r[rcoeffres].coeff[0] = roots[0];
+    r[rcoeffres].coeff[1] = 1;
+    r[rcoeffres].order = 1;
+
+    // initialize lcoeff[1] with x
+    // we'll fill in the 0th order term in each loop iter
+    l.coeff[1] = 1;
+
+    // loop through, using previous run's result as the new right hand side
+    // this allows us to multiply one group at a time
+    for (unsigned int i = 1; i < nroots; i++) {
+        l.coeff[0] = roots[i];
+        unsigned int nextrcoeff = rcoeffres;
+        rcoeffres = (rcoeffres + 1) % 2;
+        r[rcoeffres].order = i + 1;
+        polynomial_mul(field, l, r[nextrcoeff], r[rcoeffres]);
+    }
+
+    memcpy(poly.coeff, r[rcoeffres].coeff, (order + 1) * sizeof(field_element_t));
+    poly.order = order;
+
+    free(l.coeff);
+    free(r[0].coeff);
+    free(r[1].coeff);
+
+    return poly;
+}
diff --git a/falcon9_decoder/src/libcorrect/reed-solomon/reed-solomon.c b/falcon9_decoder/src/libcorrect/reed-solomon/reed-solomon.c
new file mode 100644
index 00000000..91a708e2
--- /dev/null
+++ b/falcon9_decoder/src/libcorrect/reed-solomon/reed-solomon.c
@@ -0,0 +1,187 @@
+#include "correct/reed-solomon/reed-solomon.h"
+
+// coeff must be of size nroots + 1
+// e.g. 2 roots (x + alpha)(x + alpha^2) yields a poly with 3 terms x^2 + g0*x + g1
+static polynomial_t reed_solomon_build_generator(field_t field, unsigned int nroots, field_element_t first_consecutive_root, unsigned int root_gap, polynomial_t generator, field_element_t *roots) {
+    // generator has order 2*t
+    // of form (x + alpha^1)(x + alpha^2)...(x - alpha^2*t)
+    for (unsigned int i = 0; i < nroots; i++) {
+        roots[i] = field.exp[(root_gap * (i + first_consecutive_root)) % 255];
+    }
+    return polynomial_create_from_roots(field, nroots, roots);
+}
+
+correct_reed_solomon *correct_reed_solomon_create(field_operation_t primitive_polynomial, field_logarithm_t first_consecutive_root, field_logarithm_t generator_root_gap, size_t num_roots) {
+    correct_reed_solomon *rs = calloc(1, sizeof(correct_reed_solomon));
+    rs->field = field_create(primitive_polynomial);
+
+    rs->block_length = 255;
+    rs->min_distance = num_roots;
+    rs->message_length = rs->block_length - rs->min_distance;
+
+    rs->first_consecutive_root = first_consecutive_root;
+    rs->generator_root_gap = generator_root_gap;
+
+    rs->generator_roots = malloc(rs->min_distance * sizeof(field_element_t));
+
+    rs->generator = reed_solomon_build_generator(rs->field, rs->min_distance, rs->first_consecutive_root, rs->generator_root_gap, rs->generator, rs->generator_roots);
+
+    rs->encoded_polynomial = polynomial_create(rs->block_length - 1);
+    rs->encoded_remainder = polynomial_create(rs->block_length - 1);
+
+    rs->has_init_decode = false;
+
+    return rs;
+}
+
+void correct_reed_solomon_destroy(correct_reed_solomon *rs) {
+    field_destroy(rs->field);
+    polynomial_destroy(rs->generator);
+    free(rs->generator_roots);
+    polynomial_destroy(rs->encoded_polynomial);
+    polynomial_destroy(rs->encoded_remainder);
+    if (rs->has_init_decode) {
+        free(rs->syndromes);
+        free(rs->modified_syndromes);
+        polynomial_destroy(rs->received_polynomial);
+        polynomial_destroy(rs->error_locator);
+        polynomial_destroy(rs->error_locator_log);
+        polynomial_destroy(rs->erasure_locator);
+        free(rs->error_roots);
+        free(rs->error_vals);
+        free(rs->error_locations);
+        polynomial_destroy(rs->last_error_locator);
+        polynomial_destroy(rs->error_evaluator);
+        polynomial_destroy(rs->error_locator_derivative);
+        for (unsigned int i = 0; i < rs->min_distance; i++) {
+            free(rs->generator_root_exp[i]);
+        }
+        free(rs->generator_root_exp);
+        for (field_operation_t i = 0; i < 256; i++) {
+            free(rs->element_exp[i]);
+        }
+        free(rs->element_exp);
+        polynomial_destroy(rs->init_from_roots_scratch[0]);
+        polynomial_destroy(rs->init_from_roots_scratch[1]);
+    }
+    free(rs);
+}
+
+void correct_reed_solomon_debug_print(correct_reed_solomon *rs) {
+    for (unsigned int i = 0; i < 256; i++) {
+        printf("%3d  %3d    %3d  %3d\n", i, rs->field.exp[i], i, rs->field.log[i]);
+    }
+    printf("\n");
+
+    printf("roots: ");
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        printf("%d", rs->generator_roots[i]);
+        if (i < rs->min_distance - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    printf("generator: ");
+    for (unsigned int i = 0; i < rs->generator.order + 1; i++) {
+        printf("%d*x^%d", rs->generator.coeff[i], i);
+        if (i < rs->generator.order) {
+            printf(" + ");
+        }
+    }
+    printf("\n\n");
+
+    printf("generator (alpha format): ");
+    for (unsigned int i = rs->generator.order + 1; i > 0; i--) {
+        printf("alpha^%d*x^%d", rs->field.log[rs->generator.coeff[i - 1]], i - 1);
+        if (i > 1) {
+            printf(" + ");
+        }
+    }
+    printf("\n\n");
+
+    printf("remainder: ");
+    bool has_printed = false;
+    for (unsigned int i = 0; i < rs->encoded_remainder.order + 1; i++) {
+        if (!rs->encoded_remainder.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->encoded_remainder.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("syndromes: ");
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        printf("%d", rs->syndromes[i]);
+        if (i < rs->min_distance - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    printf("numerrors: %d\n\n", rs->error_locator.order);
+
+    printf("error locator: ");
+    has_printed = false;
+    for (unsigned int i = 0; i < rs->error_locator.order + 1; i++) {
+        if (!rs->error_locator.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->error_locator.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("error roots: ");
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        printf("%d@%d", polynomial_eval(rs->field, rs->error_locator, rs->error_roots[i]), rs->error_roots[i]);
+        if (i < rs->error_locator.order - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    printf("error evaluator: ");
+    has_printed = false;
+    for (unsigned int i = 0; i < rs->error_evaluator.order; i++) {
+        if (!rs->error_evaluator.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->error_evaluator.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("error locator derivative: ");
+    has_printed = false;
+    for (unsigned int i = 0; i < rs->error_locator_derivative.order; i++) {
+        if (!rs->error_locator_derivative.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->error_locator_derivative.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("error locator: ");
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        printf("%d@%d", rs->error_vals[i], rs->error_locations[i]);
+        if (i < rs->error_locator.order - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+}
diff --git a/falcon9_decoder/src/main.cpp b/falcon9_decoder/src/main.cpp
new file mode 100644
index 00000000..48f883be
--- /dev/null
+++ b/falcon9_decoder/src/main.cpp
@@ -0,0 +1,254 @@
+#include <imgui.h>
+#include <watcher.h>
+#include <config.h>
+#include <core.h>
+#include <gui/style.h>
+#include <signal_path/signal_path.h>
+#include <module.h>
+#include <options.h>
+
+#include <dsp/pll.h>
+#include <dsp/stream.h>
+#include <dsp/demodulator.h>
+#include <dsp/window.h>
+#include <dsp/resampling.h>
+#include <dsp/processing.h>
+#include <dsp/routing.h>
+
+#include <dsp/deframing.h>
+#include <falcon_fec.h>
+#include <falcon_packet.h>
+#include <dsp/sink.h>
+
+#include <gui/widgets/symbol_diagram.h>
+
+#define CONCAT(a, b)    ((std::string(a) + b).c_str())
+
+SDRPP_MOD_INFO {
+    /* Name:            */ "falcon9_decoder",
+    /* Description:     */ "Falcon9 telemetry decoder for SDR++",
+    /* Author:          */ "Ryzerth",
+    /* Version:         */ 0, 1, 0,
+    /* Max instances    */ -1
+};
+
+#define INPUT_SAMPLE_RATE   6000000
+
+class Falcon9DecoderModule : public ModuleManager::Instance {
+public:
+    Falcon9DecoderModule(std::string name) {
+        this->name = name;
+
+        vfo = sigpath::vfoManager.createVFO(name, ImGui::WaterfallVFO::REF_CENTER, 0, 4000000, INPUT_SAMPLE_RATE, 1);
+
+        // dsp::Splitter<float> split;
+        // dsp::Reshaper<float> reshape;
+        // dsp::HandlerSink<float> symSink;
+
+        // dsp::stream<float> thrInput;
+        // dsp::Threshold thr;
+
+        demod.init(vfo->output, INPUT_SAMPLE_RATE, 2000000.0f);
+        recov.init(&demod.out, (float)INPUT_SAMPLE_RATE / 3572000.0f, 0.00765625f, 0.175f, 0.005f);
+        split.init(&recov.out);
+        split.bindStream(&reshapeInput);
+        split.bindStream(&thrInput);
+        reshape.init(&reshapeInput, 1024, 198976);
+        symSink.init(&reshape.out, symSinkHandler, this);
+        thr.init(&thrInput);
+        deframe.init(&thr.out, 10232, syncWord, 32);
+        falconRS.init(&deframe.out);
+        pkt.init(&falconRS.out);
+        sink.init(&pkt.out, sinkHandler, this);
+
+        demod.start();
+        recov.start();
+        split.start();
+        reshape.start();
+        symSink.start();
+        sink.start();
+        pkt.start();
+        falconRS.start();
+        deframe.start();
+        thr.start();
+
+        ffplay = _popen("ffplay -framedrop -hide_banner -loglevel panic -window_title \"Falcon 9 Cameras\" -", "wb");
+
+        gui::menu.registerEntry(name, menuHandler, this, this);
+    }
+
+    ~Falcon9DecoderModule() {
+        
+    }
+
+    void enable() {
+        vfo = sigpath::vfoManager.createVFO(name, ImGui::WaterfallVFO::REF_CENTER, 0, 4000000, INPUT_SAMPLE_RATE, 1);
+
+        demod.setInput(vfo->output);
+
+        demod.start();
+        recov.start();
+        split.start();
+        reshape.start();
+        symSink.start();
+        sink.start();
+        pkt.start();
+        falconRS.start();
+        deframe.start();
+        thr.start();
+
+        enabled = true;
+    }
+
+    void disable() {
+        demod.stop();
+        recov.stop();
+        split.stop();
+        reshape.stop();
+        symSink.stop();
+        sink.stop();
+        pkt.stop();
+        falconRS.stop();
+        deframe.stop();
+        thr.stop();
+
+        sigpath::vfoManager.deleteVFO(vfo);
+        enabled = false;
+    }
+
+    bool isEnabled() {
+        return enabled;
+    }
+
+private:
+    static void menuHandler(void* ctx) {
+        Falcon9DecoderModule* _this = (Falcon9DecoderModule*)ctx;
+
+        float menuWidth = ImGui::GetContentRegionAvailWidth();
+
+        if (!_this->enabled) { style::beginDisabled(); }
+
+        ImGui::SetNextItemWidth(menuWidth);
+        _this->symDiag.draw();
+
+        if (_this->logsVisible) {
+            if (ImGui::Button("Hide logs", ImVec2(menuWidth, 0))) { _this->logsVisible = false; }
+        }
+        else {
+            if (ImGui::Button("Show logs", ImVec2(menuWidth, 0))) { _this->logsVisible = true; }
+        }
+
+        if (_this->logsVisible) {
+            std::lock_guard<std::mutex> lck(_this->logsMtx);
+
+            ImGui::Begin("Falcon9 Telemetry");
+            ImGui::BeginTabBar("Falcon9Tabs");
+
+            // GPS Logs
+            ImGui::BeginTabItem("GPS");
+            if (ImGui::Button("Clear logs##GPSClear")) { _this->gpsLogs.clear(); }
+            ImGui::BeginChild(ImGuiID("GPSChild"));
+            ImGui::TextUnformatted(_this->gpsLogs.c_str());
+            ImGui::SetScrollHere(1.0f);
+            ImGui::EndChild();
+            ImGui::EndTabItem();
+
+            // STMM1A Logs
+            ImGui::BeginTabItem("STMM1A");
+
+            ImGui::EndTabItem();
+
+            // STMM1B Logs
+            ImGui::BeginTabItem("STMM1B");
+
+            ImGui::EndTabItem();
+
+            // STMM1C Logs
+            ImGui::BeginTabItem("STMM1C");
+
+            ImGui::EndTabItem();
+
+            ImGui::EndTabBar();
+            ImGui::End();
+        }
+
+        if (!_this->enabled) { style::endDisabled(); }
+    }
+
+    static void sinkHandler(uint8_t* data, int count, void* ctx) {
+        Falcon9DecoderModule* _this = (Falcon9DecoderModule*)ctx;
+
+        uint16_t length = (((data[0] & 0b1111) << 8) | data[1]) + 2;
+        uint64_t pktId =    ((uint64_t)data[2] << 56) | ((uint64_t)data[3] << 48) | ((uint64_t)data[4] << 40) | ((uint64_t)data[5] << 32)
+                        |   ((uint64_t)data[6] << 24) | ((uint64_t)data[7] << 16) | ((uint64_t)data[8] << 8) | data[9];
+
+        if (pktId == 0x0117FE0800320303 || pktId == 0x0112FA0800320303) { 
+            data[length - 2] = 0;
+            _this->logsMtx.lock();
+            _this->gpsLogs += (char*)(data + 25);
+            _this->logsMtx.unlock();
+        }
+        else if (pktId == 0x01123201042E1403) {
+            fwrite(data + 25, 1, 940, _this->ffplay);
+        }
+
+        //printf("%016" PRIX64 ": %d bytes, %d full\n", pktId, length, count);
+    }
+
+    static void symSinkHandler(float* data, int count, void* ctx) {
+        Falcon9DecoderModule* _this = (Falcon9DecoderModule*)ctx;
+        float* buf = _this->symDiag.aquireBuffer();
+        memcpy(buf, data, 1024*sizeof(float));
+        _this->symDiag.releaseBuffer();
+    }
+
+    std::string name;
+    bool enabled = true;
+    
+    bool logsVisible = false;
+
+    std::mutex logsMtx;
+    std::string gpsLogs = "";
+
+    // DSP Chain
+    dsp::FloatFMDemod demod;
+    dsp::MMClockRecovery recov;
+
+    dsp::Splitter<float> split;
+
+    dsp::stream<float> reshapeInput;
+    dsp::Reshaper<float> reshape;
+    dsp::HandlerSink<float> symSink;
+
+    dsp::stream<float> thrInput;
+    dsp::Threshold thr;
+
+    uint8_t syncWord[32] = {0,0,0,1,1,0,1,0,1,1,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,1,1,1,0,1};
+    dsp::Deframer deframe;
+    dsp::FalconRS falconRS;
+    dsp::FalconPacketSync pkt;
+    dsp::HandlerSink<uint8_t> sink;
+
+    FILE* ffplay;
+
+    VFOManager::VFO* vfo;
+
+    ImGui::SymbolDiagram symDiag;
+
+};
+
+MOD_EXPORT void _INIT_() {
+    // Nothing
+}
+
+MOD_EXPORT ModuleManager::Instance* _CREATE_INSTANCE_(std::string name) {
+    return new Falcon9DecoderModule(name);
+}
+
+MOD_EXPORT void _DELETE_INSTANCE_(void* instance) {
+    delete (Falcon9DecoderModule*)instance;
+}
+
+MOD_EXPORT void _END_() {
+    // Nothing either
+}
\ No newline at end of file
diff --git a/make_windows_package.ps1 b/make_windows_package.ps1
index 4b8f0db2..fc39c9bf 100644
--- a/make_windows_package.ps1
+++ b/make_windows_package.ps1
@@ -32,6 +32,9 @@ cp build/soapy_source/Release/soapy_source.dll sdrpp_windows_x64/modules/
 cp build/audio_sink/Release/audio_sink.dll sdrpp_windows_x64/modules/
 cp "C:/Program Files (x86)/RtAudio/bin/rtaudio.dll" sdrpp_windows_x64/
 
+# Copy supporting libs
+cp 'C:/Program Files/PothosSDR/bin/libusb-1.0.dll' sdrpp_windows_x64/
+
 Compress-Archive -Path sdrpp_windows_x64/ -DestinationPath sdrpp_windows_x64.zip
 
 rm -Force -Recurse sdrpp_windows_x64
\ No newline at end of file