Bugfix + added M17 decoder to the linux CI

2025-11-05 19:29:12 +01:00 · 2021-10-02 17:01:23 +02:00
parent 26fa23c8f5
commit b4213ea049
86 changed files with 6601 additions and 20 deletions
--- a/core/CMakeLists.txt
+++ b/core/CMakeLists.txt
@@ -12,6 +12,8 @@ else ()
 endif ()
 add_definitions(-DSDRPP_IS_CORE)

+add_subdirectory("libcorrect/")
+
 # Main code
 file(GLOB_RECURSE SRC "src/*.cpp" "src/*.c")

@@ -25,6 +27,10 @@ target_compile_definitions(sdrpp_core PUBLIC INSTALL_PREFIX="${CMAKE_INSTALL_PRE
 target_include_directories(sdrpp_core PUBLIC "src/")
 target_include_directories(sdrpp_core PUBLIC "src/imgui")

+# Link to libcorrect
+target_include_directories(sdrpp_core PUBLIC "libcorrect/include")
+target_link_libraries(sdrpp_core PUBLIC correct_static)
+
 if (OPT_OVERRIDE_STD_FILESYSTEM)
 target_include_directories(sdrpp_core PUBLIC "std_replacement")
 endif (OPT_OVERRIDE_STD_FILESYSTEM)
--- a/core/libcorrect/.appveyor-install-tools.cmd
+++ b/core/libcorrect/.appveyor-install-tools.cmd
@@ -0,0 +1,47 @@
+@echo on
+
+if NOT EXIST C:\projects\tools (
+  mkdir C:\projects\tools
+)
+cd C:\projects\tools
+
+::###########################################################################
+:: Setup Compiler
+::###########################################################################
+if NOT EXIST llvm-installer.exe (
+    appveyor DownloadFile http://prereleases.llvm.org/win-snapshots/LLVM-5.0.0-r306282-win32.exe -FileName llvm-installer.exe
+)
+
+START /WAIT llvm-installer.exe /S /D=C:\"projects\tools\LLVM-install"
+@set PATH="C:\projects\tools\LLVM-install\bin";%PATH%
+clang-cl -v
+
+if DEFINED MINGW_PATH rename "C:\Program Files\Git\usr\bin\sh.exe" "sh-ignored.exe"
+if DEFINED MINGW_PATH @set "PATH=%PATH:C:\Program Files (x86)\Git\bin=%"
+if DEFINED MINGW_PATH @set "PATH=%PATH%;%MINGW_PATH%"
+if DEFINED MINGW_PATH g++ -v
+
+::###########################################################################
+:: Install a recent CMake
+::###########################################################################
+if NOT EXIST cmake (
+  appveyor DownloadFile https://cmake.org/files/v3.7/cmake-3.7.2-win64-x64.zip -FileName cmake.zip
+  7z x cmake.zip -oC:\projects\tools > nul
+  move C:\projects\tools\cmake-* C:\projects\tools\cmake
+  rm cmake.zip
+)
+@set PATH=C:\projects\tools\cmake\bin;%PATH%
+cmake --version
+
+::###########################################################################
+:: Install Ninja
+::###########################################################################
+if NOT EXIST ninja (
+  appveyor DownloadFile https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-win.zip -FileName ninja.zip
+  7z x ninja.zip -oC:\projects\tools\ninja > nul
+  rm ninja.zip
+)
+@set PATH=C:\projects\tools\ninja;%PATH%
+ninja --version
+
+@echo off
--- a/core/libcorrect/.gitignore
+++ b/core/libcorrect/.gitignore
@@ -0,0 +1 @@
+build
--- a/core/libcorrect/.travis.yml
+++ b/core/libcorrect/.travis.yml
@@ -0,0 +1,12 @@
+language: c
+matrix:
+    include:
+        - os: linux
+          dist: trusty
+        - os: osx
+script:
+    - mkdir build
+    - cd build
+    - cmake ..
+    - make shim
+    - make check CTEST_OUTPUT_ON_FAILURE=TRUE
--- a/core/libcorrect/CMakeLists.txt
+++ b/core/libcorrect/CMakeLists.txt
@@ -0,0 +1,102 @@
+cmake_minimum_required(VERSION 2.8)
+project(Correct C)
+include(CheckLibraryExists)
+include(CheckIncludeFiles)
+include(CheckCSourceCompiles)
+include(CMakePushCheckState)
+include(CheckCCompilerFlag)
+
+if(MSVC)
+  set(LIBM "")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /W4")
+else()
+  set(LIBM "m")
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -std=c99 -Wall")
+  check_c_compiler_flag(-Wpedantic COMPILER_SUPPORTS_WPEDANTIC)
+  if(COMPILER_SUPPORTS_WPEDANTIC)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wpedantic")
+  endif()
+  if(CMAKE_BUILD_TYPE STREQUAL "Debug")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g3 -O0 -fsanitize=address")
+    # -no_pie is an Apple ld option, so it is only handed to the linker on Apple platforms.
+    if(APPLE)
+      set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-no_pie")
+    endif()
+  else()
+    # Every non-Debug build is optimized; Clang and all other compilers use the same level.
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2")
+    if(CMAKE_BUILD_TYPE STREQUAL "Profiling")
+      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -O2 -g3")
+    endif()
+  endif()
+endif()
+
+find_library(FEC fec)  # optional libfec; FEC receives the library path when it is found
+check_library_exists(fec dotprod "" HAVE_LIBFEC)  # first argument is the library name to link, i.e. "fec"
+
+if(NOT CMAKE_CROSSCOMPILING)
+  # Probe the build host: HAVE_SSE is set when an SSE 4.1 intrinsic compiles with -march=native
+  cmake_push_check_state(RESET)
+  set(CMAKE_REQUIRED_FLAGS "-march=native")
+  check_c_source_compiles("
+    #include <x86intrin.h>
+    int main() {
+      __m128i a;
+      __m128i b;
+      __m128i c = _mm_min_epu16(a, b);
+      return 0;
+    }" HAVE_SSE)
+  cmake_pop_check_state()
+endif()
+
+if(HAVE_SSE)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse4.1")
+endif()
+
+set(CMAKE_CXX_VISIBILITY_PRESET hidden)
+set(CMAKE_VISIBILITY_INLINES_HIDDEN 1)
+set(LIBRARY_OUTPUT_PATH ${PROJECT_BINARY_DIR}/lib)
+set(EXECUTABLE_OUTPUT_PATH ${PROJECT_BINARY_DIR}/bin)
+
+include_directories(${PROJECT_SOURCE_DIR}/include)
+add_subdirectory(src)
+
+set(INSTALL_HEADERS "${PROJECT_BINARY_DIR}/include/correct.h")
+
+add_custom_target(correct-h ALL COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/include/correct.h ${PROJECT_BINARY_DIR}/include/correct.h)
+
+if(HAVE_SSE)
+  set(correct_obj_files $<TARGET_OBJECTS:correct-reed-solomon> $<TARGET_OBJECTS:correct-convolutional> $<TARGET_OBJECTS:correct-convolutional-sse>)
+  set(INSTALL_HEADERS ${INSTALL_HEADERS} ${PROJECT_BINARY_DIR}/include/correct-sse.h)
+  add_custom_target(correct-sse-h ALL COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/include/correct-sse.h ${PROJECT_BINARY_DIR}/include/correct-sse.h)
+else()
+  set(correct_obj_files $<TARGET_OBJECTS:correct-reed-solomon> $<TARGET_OBJECTS:correct-convolutional>)
+endif()
+add_library(correct SHARED ${correct_obj_files})
+add_library(correct_static STATIC ${correct_obj_files})
+set_target_properties(correct_static PROPERTIES OUTPUT_NAME "correct")
+if(HAVE_SSE)
+  target_compile_definitions(correct PUBLIC HAVE_SSE=1)
+  target_compile_definitions(correct_static PUBLIC HAVE_SSE=1)
+endif()
+
+add_subdirectory(util)
+add_subdirectory(tests)
+add_subdirectory(tools)
+# add_subdirectory(benchmarks)
+
+install(TARGETS correct correct_static
+        DESTINATION lib)
+install(FILES ${INSTALL_HEADERS} DESTINATION "${CMAKE_INSTALL_PREFIX}/include")
+
+add_library(fec_shim_static EXCLUDE_FROM_ALL src/fec_shim.c ${correct_obj_files})
+set_target_properties(fec_shim_static PROPERTIES OUTPUT_NAME "fec")
+add_library(fec_shim_shared SHARED EXCLUDE_FROM_ALL src/fec_shim.c ${correct_obj_files})
+set_target_properties(fec_shim_shared PROPERTIES OUTPUT_NAME "fec")
+add_custom_target(fec-shim-h COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/include/fec_shim.h ${PROJECT_BINARY_DIR}/include/fec.h)
+add_custom_target(shim DEPENDS fec_shim_static fec_shim_shared fec-shim-h)
+
+install(TARGETS fec_shim_static fec_shim_shared
+        DESTINATION lib
+        OPTIONAL)
+install(FILES ${PROJECT_BINARY_DIR}/include/fec.h DESTINATION "${CMAKE_INSTALL_PREFIX}/include" OPTIONAL)
--- a/core/libcorrect/LICENSE
+++ b/core/libcorrect/LICENSE
@@ -0,0 +1,12 @@
+Copyright (c) 2016, Brian Armstrong
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
--- a/core/libcorrect/README.md
+++ b/core/libcorrect/README.md
@@ -0,0 +1,38 @@
+[libcorrect](https://github.com/quiet/libcorrect)
+===========
+[![OSX/Linux Build Status](https://travis-ci.org/quiet/libcorrect.svg?branch=master)](https://travis-ci.org/quiet/libcorrect)
+[![Windows Build status](https://ci.appveyor.com/api/projects/status/i3e84jmj00fa5my8/branch/master?svg=true)](https://ci.appveyor.com/project/brian-armstrong/libcorrect/branch/master)
+
+libcorrect is a library for Forward Error Correction. By using libcorrect, you can encode extra redundancy into a packet of data and then send it across a lossy channel. When the packet is received, it can be decoded to recover the original, pre-encoded data.
+
+libcorrect accomplishes this task with two algorithms, [Convolutional codes](https://en.wikipedia.org/wiki/Convolutional_code) and [Reed-Solomon](https://en.wikipedia.org/wiki/Reed%E2%80%93Solomon_error_correction). Convolutional codes are robust to a constant background noise, while Reed-Solomon error correction is effective at dealing with noise that occurs in bursts. These algorithms have played an important role in [telecommunications](https://en.wikipedia.org/wiki/Error_detection_and_correction#Deep-space_telecommunications). libcorrect uses a [Viterbi algorithm](https://en.wikipedia.org/wiki/Viterbi_algorithm) decoder to decode convolutional codes.
+
+libcorrect is a performant, BSD-licensed library. It is also the author's hope that this library's contents could help others learn how its algorithms work.
+
+Design goals
+-----------
+
+1. libcorrect should be a drop-in, BSD-licensed substitute for [libfec](http://www.ka9q.net/code/fec/), which offers similar functionality under the LGPL-license. Although libfec is a fantastic library, the state of LGPL-licensed libraries on mobile devices is somewhat uncertain. For this reason, libcorrect is a completely new approach under the BSD license which supports the same algorithms as libfec. Additionally, libcorrect can be built with a compatibility layer so that libcorrect can be linked in place of libfec.
+
+    Achieving this goal gives [libquiet](https://github.com/quiet/quiet) a fully BSD-/MIT-licensed set of dependencies, which gives libquiet more flexibility in mobile applications.
+
+2. libcorrect should make it easier to investigate how forward error correction works. To accomplish this, libcorrect provides tools to test the fitness of convolutional codes and their polynomials. Additionally, libcorrect should be written in a way that leads to easy understanding of these powerful algorithms. This library's roadmap includes more documentation on how these algorithms work and how to increase their computational performance.
+
+3. libcorrect should explore further into error correction. This goal would help libquiet operate in noisier situations. One approach might be to use parts of libcorrect's Viterbi Algorithm decoder to create a [Turbo code](https://en.wikipedia.org/wiki/Turbo_code) decoder, although this is just an idea and may turn out to be prohibitively difficult.
+
+Build
+-----------
+libcorrect uses CMake, which allows for out-of-source builds. To get started, make sure that you have CMake installed, and then, from libcorrect's source directory, run `mkdir build && cd build && cmake .. && make && make install`. Additionally, if you would like the libfec compatibility layer, you can run `make shim && make install`, though do be cautioned that this can overwrite an existing installation of libfec.
+
+If you are on a host which has `<x86intrin.h>` available, then libcorrect will automatically build its SSE version as well. The SSE headers are provided under `<correct-sse.h>`. For now, it is on the caller of this code to ensure that SSE is available and can be used. libcorrect requires SSE functions up to and including SSE4.
+
+If you have any questions or problems with libcorrect, do not hesitate to open an issue.
+
+-----------
+I'd like to thank Ryan Hitchman and Josh Gao for all of their help and rubber ducking.
+
+A huge thank you goes to [Lucas Teske](https://github.com/racerxdl) for finding all the ways that libcorrect was broken on Windows and to [Denis Golovan](https://github.com/MageSlayer) for finding an error in the returned length of the convolutional code decoder.
+
+
+
+
--- a/core/libcorrect/appveyor.yml
+++ b/core/libcorrect/appveyor.yml
@@ -0,0 +1,43 @@
+version: '{build}'
+
+build:
+    verbosity: detailed
+
+branches:
+    only:
+        - master
+
+environment:
+    matrix:
+        - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
+          COMPILER: cl.exe
+          MSVC_BAT: C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat
+          MSVC_BAT_ARCH: x86
+          GENERATOR: "Visual Studio 14 2015 Win64"
+          APPVEYOR_SAVE_CACHE_ON_ERROR: true
+          DLL_PATH: lib\Release\fec.dll
+
+install:
+    - call "%APPVEYOR_BUILD_FOLDER%\\.appveyor-install-tools.cmd"
+
+before_build:
+    - if DEFINED MSVC_BAT call "%MSVC_BAT%" %MSVC_BAT_ARCH%
+    - cd %APPVEYOR_BUILD_FOLDER%
+
+build_script:
+    - mkdir build
+    - cd build
+    - cmake -G "%GENERATOR%" -DCMAKE_C_COMPILER=%COMPILER% -DCMAKE_CXX_COMPILER=%COMPILER% -DCMAKE_BUILD_TYPE=Release ..
+    - cmake --build . --config Release --target shim
+    - cmake --build . --config Release --target test_runners
+    - dumpbin /EXPORTS %DLL_PATH%
+
+test_script:
+    - cd tests
+    - set CTEST_OUTPUT_ON_FAILURE=1
+    - ctest -C Release
+
+cache:
+    - C:\projects\tools\ninja
+    - C:\projects\tools\cmake
+    - C:\projects\tools\llvm-installer.exe
--- a/core/libcorrect/include/correct-sse.h
+++ b/core/libcorrect/include/correct-sse.h
@@ -0,0 +1,30 @@
+#ifndef CORRECT_SSE_H
+#define CORRECT_SSE_H
+#include <correct.h>
+
+struct correct_convolutional_sse;
+typedef struct correct_convolutional_sse correct_convolutional_sse;
+
+/* SSE versions of libcorrect's convolutional encoder/decoder.
+ * These instances should not be used with the non-sse functions,
+ * and non-sse instances should not be used with the sse functions.
+ */
+
+correct_convolutional_sse *correct_convolutional_sse_create(
+    size_t rate, size_t order, const correct_convolutional_polynomial_t *poly);
+
+void correct_convolutional_sse_destroy(correct_convolutional_sse *conv);
+
+size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len);
+
+size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg,
+                                        size_t msg_len, uint8_t *encoded);
+
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg);
+
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv,
+                                              const correct_convolutional_soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg);
+
+#endif
--- a/core/libcorrect/include/correct.h
+++ b/core/libcorrect/include/correct.h
@@ -0,0 +1,277 @@
+#ifndef CORRECT_H
+#define CORRECT_H
+#include <stdint.h>
+
+#ifndef _MSC_VER
+#include <unistd.h>
+#else
+#include <stddef.h>
+typedef ptrdiff_t ssize_t;
+#endif
+
+
+
+// Convolutional Codes
+
+// Convolutional polynomials are 16 bits wide
+typedef uint16_t correct_convolutional_polynomial_t;
+
+static const correct_convolutional_polynomial_t correct_conv_r12_6_polynomial[] = {073, 061};
+static const correct_convolutional_polynomial_t correct_conv_r12_7_polynomial[] = {0161, 0127};
+static const correct_convolutional_polynomial_t correct_conv_r12_8_polynomial[] = {0225, 0373};
+static const correct_convolutional_polynomial_t correct_conv_r12_9_polynomial[] = {0767, 0545};
+static const correct_convolutional_polynomial_t correct_conv_r13_6_polynomial[] = {053, 075, 047};
+static const correct_convolutional_polynomial_t correct_conv_r13_7_polynomial[] = {0137, 0153,
+                                                                                   0121};
+static const correct_convolutional_polynomial_t correct_conv_r13_8_polynomial[] = {0333, 0257,
+                                                                                   0351};
+static const correct_convolutional_polynomial_t correct_conv_r13_9_polynomial[] = {0417, 0627,
+                                                                                   0675};
+
+typedef uint8_t correct_convolutional_soft_t;
+
+struct correct_convolutional;
+typedef struct correct_convolutional correct_convolutional;
+
+/* correct_convolutional_create allocates and initializes an encoder/decoder for
+ * a convolutional code with the given parameters. This function expects that
+ * poly will contain inv_rate elements. E.g., to create a conv. code instance
+ * with rate 1/2, order 7 and polynomials 0161, 0127, call
+ * correct_convolutional_create(2, 7, (correct_convolutional_polynomial_t[]){0161, 0127});
+ *
+ * If this call is successful, it returns a non-NULL pointer.
+ */
+correct_convolutional *correct_convolutional_create(size_t inv_rate, size_t order,
+                                                    const correct_convolutional_polynomial_t *poly);
+
+/* correct_convolutional_destroy releases all resources associated
+ * with conv. This pointer should not be used for further calls
+ * after calling destroy.
+ */
+void correct_convolutional_destroy(correct_convolutional *conv);
+
+/* correct_convolutional_encode_len returns the number of *bits*
+ * that encoding a message of msg_len *bytes* will produce. To
+ * convert this returned length to bytes, take the length
+ * modulo 8. If it's nonzero, then the length in bytes is
+ * length/8 + 1. If it is zero, then the length is just
+ * length/8.
+ */
+size_t correct_convolutional_encode_len(correct_convolutional *conv, size_t msg_len);
+
+/* correct_convolutional_encode uses the given conv instance to
+ * encode a block of data and write it to encoded. The length of
+ * encoded must be long enough to hold the resulting encoded length,
+ * which can be calculated by calling correct_convolutional_encode_len.
+ * However, this length should first be converted to bytes, as that
+ * function returns the length in bits.
+ *
+ * This function returns the number of bits written to encoded. If
+ * this is not an exact multiple of 8, then it occupies an additional
+ * byte.
+ */
+size_t correct_convolutional_encode(correct_convolutional *conv, const uint8_t *msg, size_t msg_len,
+                                    uint8_t *encoded);
+
+/* correct_convolutional_decode uses the given conv instance to
+ * decode a block encoded by correct_convolutional_encode. This
+ * call can cope with some bits being corrupted. This function
+ * cannot detect if there are too many bits corrupted, however,
+ * and will still write a message even if it is not recovered
+ * correctly. It is up to the user to perform checksums or CRC
+ * in order to guarantee that the decoded message is intact.
+ *
+ * num_encoded_bits should contain the length of encoded in *bits*.
+ * This value need not be an exact multiple of 8. However,
+ * it must be a multiple of the inv_rate used to create
+ * the conv instance.
+ *
+ * This function writes the result to msg, which must be large
+ * enough to hold the decoded message. A good conservative size
+ * for this buffer is the number of encoded bits multiplied by the
+ * rate of the code, e.g. for a rate 1/2 code, divide by 2. This
+ * value should then be converted to bytes to find the correct
+ * length for msg.
+ *
+ * This function returns the number of bytes written to msg. If
+ * it fails, it returns -1.
+ */
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg);
+
+/* correct_convolutional_decode_soft uses the given conv instance
+ * to decode a block encoded by correct_convolutional_encode and
+ * then modulated/demodulated to 8-bit symbols. This function expects
+ * that 1 is mapped to 255 and 0 to 0. An erased symbol should be
+ * set to 128. The decoded message may contain errors.
+ *
+ * num_encoded_bits should contain the length of encoded in *bits*.
+ * This value need not be an exact multiple of 8. However,
+ * it must be a multiple of the inv_rate used to create
+ * the conv instance.
+ *
+ * This function writes the result to msg, which must be large
+ * enough to hold the decoded message. A good conservative size
+ * for this buffer is the number of encoded bits multiplied by the
+ * rate of the code, e.g. for a rate 1/2 code, divide by 2. This
+ * value should then be converted to bytes to find the correct
+ * length for msg.
+ *
+ * This function returns the number of bytes written to msg. If
+ * it fails, it returns -1.
+ */
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv,
+                                          const correct_convolutional_soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg);
+
+// Reed-Solomon
+
+struct correct_reed_solomon;
+typedef struct correct_reed_solomon correct_reed_solomon;
+
+static const uint16_t correct_rs_primitive_polynomial_8_4_3_2_0 =
+    0x11d;  // x^8 + x^4 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_5_3_1_0 =
+    0x12b;  // x^8 + x^5 + x^3 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_5_3_2_0 =
+    0x12d;  // x^8 + x^5 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_3_2_0 =
+    0x14d;  // x^8 + x^6 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_4_3_2_1_0 =
+    0x15f;  // x^8 + x^6 + x^4 + x^3 + x^2 + x + 1;
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_1_0 =
+    0x163;  // x^8 + x^6 + x^5 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_2_0 =
+    0x165;  // x^8 + x^6 + x^5 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_3_0 =
+    0x169;  // x^8 + x^6 + x^5 + x^3 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_6_5_4_0 =
+    0x171;  // x^8 + x^6 + x^5 + x^4 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_2_1_0 =
+    0x187;  // x^8 + x^7 + x^2 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_3_2_0 =
+    0x18d;  // x^8 + x^7 + x^3 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_5_3_0 =
+    0x1a9;  // x^8 + x^7 + x^5 + x^3 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_1_0 =
+    0x1c3;  // x^8 + x^7 + x^6 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_3_2_1_0 =
+    0x1cf;  // x^8 + x^7 + x^6 + x^3 + x^2 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_5_2_1_0 =
+    0x1e7;  // x^8 + x^7 + x^6 + x^5 + x^2 + x + 1
+
+static const uint16_t correct_rs_primitive_polynomial_8_7_6_5_4_2_0 =
+    0x1f5;  // x^8 + x^7 + x^6 + x^5 + x^4 + x^2 + 1
+
+static const uint16_t correct_rs_primitive_polynomial_ccsds =
+    0x187;  // x^8 + x^7 + x^2 + x + 1
+
+/* correct_reed_solomon_create allocates and initializes an
+ * encoder/decoder for a given reed solomon error correction
+ * code. The block size must be 255 bytes with 8-bit symbols.
+ *
+ * This block can repair corrupted bytes. It can handle as
+ * many as num_roots/2 bytes having corruption and still recover
+ * the encoded payload. However, using more num_roots
+ * adds more parity overhead and substantially increases
+ * the computational time for decoding.
+ *
+ * primitive_polynomial should be one of the given values in this
+ * file. Sane values for first_consecutive_root and
+ * generator_root_gap are 1 and 1. Not all combinations of
+ * values produce valid codes.
+ */
+correct_reed_solomon *correct_reed_solomon_create(uint16_t primitive_polynomial,
+                                                  uint8_t first_consecutive_root,
+                                                  uint8_t generator_root_gap,
+                                                  size_t num_roots);
+
+/* correct_reed_solomon_encode uses the rs instance to encode
+ * parity information onto a block of data. msg_length should be
+ * no more than the payload size for one block e.g. no more
+ * than 223 for a (255, 223) code. Shorter blocks will be encoded
+ * with virtual padding where the padding is not emitted.
+ *
+ * encoded should be at least msg_length + parity length bytes long
+ *
+ * It is allowable for msg and encoded to be the same pointer. In
+ * that case, the parity bytes will be written after the msg bytes
+ * end.
+ *
+ * This function returns the number of bytes written to encoded.
+ */
+ssize_t correct_reed_solomon_encode(correct_reed_solomon *rs, const uint8_t *msg, size_t msg_length,
+                                    uint8_t *encoded);
+
+/* correct_reed_solomon_decode uses the rs instance to decode
+ * a payload from a block containing payload and parity bytes.
+ * This function can recover in spite of some bytes being corrupted.
+ *
+ * In most cases, if the block is too corrupted, this function
+ * will return -1 and not perform decoding. It is possible but
+ * unlikely that the payload written to msg will contain
+ * errors when this function returns a positive value.
+ *
+ * msg should be long enough to contain a decoded payload for
+ * this encoded block.
+ *
+ * This function returns a positive number of bytes written to msg
+ * if it has decoded or -1 if it has encountered an error.
+ */
+ssize_t correct_reed_solomon_decode(correct_reed_solomon *rs, const uint8_t *encoded,
+                                    size_t encoded_length, uint8_t *msg);
+
+/* correct_reed_solomon_decode_with_erasures uses the rs
+ * instance to decode a payload from a block containing payload
+ * and parity bytes. Additionally, the user can provide the
+ * indices of bytes which have been suspected to be corrupted.
+ * This erasure information is typically provided by a demodulating
+ * or receiving device. This function can recover with
+ * some additional errors on top of the erasures.
+ *
+ * In order to successfully decode, the quantity
+ * (num_erasures + 2*num_errors) must be less than
+ * num_roots.
+ *
+ * erasure_locations should contain erasure_length items.
+ * erasure_length should not exceed the number of parity
+ * bytes encoded into this block.
+ *
+ * In most cases, if the block is too corrupted, this function
+ * will return -1 and not perform decoding. It is possible but
+ * unlikely that the payload written to msg will contain
+ * errors when this function returns a positive value.
+ *
+ * msg should be long enough to contain a decoded payload for
+ * this encoded block.
+ *
+ * This function returns a positive number of bytes written to msg
+ * if it has decoded or -1 if it has encountered an error.
+ */
+ssize_t correct_reed_solomon_decode_with_erasures(correct_reed_solomon *rs, const uint8_t *encoded,
+                                                  size_t encoded_length,
+                                                  const uint8_t *erasure_locations,
+                                                  size_t erasure_length, uint8_t *msg);
+
+/* correct_reed_solomon_destroy releases the resources
+ * associated with rs. This pointer should not be
+ * used for any functions after this call.
+ */
+void correct_reed_solomon_destroy(correct_reed_solomon *rs);
+
+#endif
+
--- a/core/libcorrect/include/correct/convolutional.h
+++ b/core/libcorrect/include/correct/convolutional.h
@@ -0,0 +1,28 @@
+#ifndef CORRECT_CONVOLUTIONAL
+#define CORRECT_CONVOLUTIONAL
+#include <stdint.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+typedef unsigned int shift_register_t;
+typedef uint16_t polynomial_t;
+typedef uint64_t path_t;
+typedef uint8_t soft_t;
+static const soft_t soft_max = UINT8_MAX;
+
+typedef uint16_t distance_t;
+static const distance_t distance_max = UINT16_MAX;
+
+typedef enum {
+    CORRECT_SOFT_LINEAR,
+    CORRECT_SOFT_QUADRATIC,
+} soft_measurement_t;
+#endif
--- a/core/libcorrect/include/correct/convolutional/bit.h
+++ b/core/libcorrect/include/correct/convolutional/bit.h
@@ -0,0 +1,44 @@
+#ifndef CORRECT_CONVOLUTIONAL_BIT
+#define CORRECT_CONVOLUTIONAL_BIT
+#include "correct/convolutional.h"
+
+typedef struct {
+    uint8_t current_byte;
+    unsigned int current_byte_len;
+    uint8_t *bytes;
+    size_t byte_index;
+    size_t len;
+} bit_writer_t;
+
+bit_writer_t *bit_writer_create(uint8_t *bytes, size_t len);
+
+void bit_writer_reconfigure(bit_writer_t *w, uint8_t *bytes, size_t len);
+
+void bit_writer_destroy(bit_writer_t *w);
+
+void bit_writer_write(bit_writer_t *w, uint8_t val, unsigned int n);
+
+void bit_writer_write_1(bit_writer_t *w, uint8_t val);
+
+void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len);
+
+void bit_writer_flush_byte(bit_writer_t *w);
+
+size_t bit_writer_length(bit_writer_t *w);
+
+typedef struct {
+    uint8_t current_byte;
+    size_t byte_index;
+    size_t len;
+    size_t current_byte_len;
+    const uint8_t *bytes;
+} bit_reader_t;
+
+bit_reader_t *bit_reader_create(const uint8_t *bytes, size_t len);
+
+void bit_reader_reconfigure(bit_reader_t *r, const uint8_t *bytes, size_t len);
+
+void bit_reader_destroy(bit_reader_t *r);
+
+uint8_t bit_reader_read(bit_reader_t *r, unsigned int n);
+#endif
--- a/core/libcorrect/include/correct/convolutional/convolutional.h
+++ b/core/libcorrect/include/correct/convolutional/convolutional.h
@@ -0,0 +1,40 @@
+#ifndef CORRECT_CONVOLUTIONAL_H
+#define CORRECT_CONVOLUTIONAL_H
+#include "correct/convolutional.h"
+#include "correct/convolutional/bit.h"
+#include "correct/convolutional/metric.h"
+#include "correct/convolutional/lookup.h"
+#include "correct/convolutional/history_buffer.h"
+#include "correct/convolutional/error_buffer.h"
+
+struct correct_convolutional {
+    const unsigned int *table;  // size 2**order
+    size_t rate;                // e.g. 2, 3...
+    size_t order;               // e.g. 7, 9...
+    unsigned int numstates;     // 2**order
+    bit_writer_t *bit_writer;
+    bit_reader_t *bit_reader;
+
+    bool has_init_decode;
+    distance_t *distances;
+    pair_lookup_t pair_lookup;
+    soft_measurement_t soft_measurement;
+    history_buffer *history_buffer;
+    error_buffer_t *errors;
+};
+
+correct_convolutional *_correct_convolutional_init(correct_convolutional *conv,
+                                                   size_t rate, size_t order,
+                                                   const polynomial_t *poly);
+void _correct_convolutional_teardown(correct_convolutional *conv);
+
+// portable versions
+void _convolutional_decode_init(correct_convolutional *conv, unsigned int min_traceback, unsigned int traceback_length, unsigned int renormalize_interval);
+void convolutional_decode_warmup(correct_convolutional *conv, unsigned int sets,
+                                 const uint8_t *soft);
+void convolutional_decode_inner(correct_convolutional *conv, unsigned int sets,
+                                const uint8_t *soft);
+void convolutional_decode_tail(correct_convolutional *conv, unsigned int sets,
+                               const uint8_t *soft);
+#endif
+
--- a/core/libcorrect/include/correct/convolutional/error_buffer.h
+++ b/core/libcorrect/include/correct/convolutional/error_buffer.h
@@ -0,0 +1,15 @@
+#include "correct/convolutional.h"
+
+typedef struct {
+    unsigned int index;  // NOTE(review): presumably selects the current entry of errors[] - confirm in error_buffer.c
+    distance_t *errors[2];
+    unsigned int num_states;
+
+    const distance_t *read_errors;  // read-only view; presumably points into errors[] - confirm
+    distance_t *write_errors;       // writable view; presumably the other errors[] entry - confirm
+} error_buffer_t;
+
+error_buffer_t *error_buffer_create(unsigned int num_states);
+void error_buffer_destroy(error_buffer_t *buf);
+void error_buffer_reset(error_buffer_t *buf);
+void error_buffer_swap(error_buffer_t *buf);
--- a/core/libcorrect/include/correct/convolutional/history_buffer.h
+++ b/core/libcorrect/include/correct/convolutional/history_buffer.h
@@ -0,0 +1,59 @@
+#include "correct/convolutional.h"
+#include "correct/convolutional/bit.h"
+
+// ring buffer of path histories
+// generates output bits after accumulating sufficient history
+typedef struct {
+    // history entries must be at least this old to be decoded
+    const unsigned int min_traceback_length;
+    // we'll decode entries in bursts. this tells us the length of the burst
+    const unsigned int traceback_group_length;
+    // we will store a total of cap entries. equal to min_traceback_length +
+    // traceback_group_length
+    const unsigned int cap;
+
+    // how many states in the shift register? this is one of the dimensions of
+    // history table
+    const unsigned int num_states;
+    // what's the high order bit of the shift register?
+    const shift_register_t highbit;
+
+    // history is a compact history representation for every shift register
+    // state,
+    //    one bit per time slice
+    uint8_t **history;
+
+    // which slice are we writing next?
+    unsigned int index;
+
+    // how many valid entries are there?
+    unsigned int len;
+
+    // temporary store of fetched bits
+    uint8_t *fetched;
+
+    // how often should we renormalize?
+    unsigned int renormalize_interval;
+    unsigned int renormalize_counter;
+} history_buffer;
+
+history_buffer *history_buffer_create(unsigned int min_traceback_length,
+                                      unsigned int traceback_group_length,
+                                      unsigned int renormalize_interval,
+                                      unsigned int num_states,
+                                      shift_register_t highbit);
+void history_buffer_destroy(history_buffer *buf);
+void history_buffer_reset(history_buffer *buf);
+void history_buffer_step(history_buffer *buf);
+uint8_t *history_buffer_get_slice(history_buffer *buf);
+shift_register_t history_buffer_search(history_buffer *buf,
+                                       const distance_t *distances,
+                                       unsigned int search_every);
+void history_buffer_traceback(history_buffer *buf, shift_register_t bestpath,
+                              unsigned int min_traceback_length,
+                              bit_writer_t *output);
+void history_buffer_process_skip(history_buffer *buf, distance_t *distances,
+                                 bit_writer_t *output, unsigned int skip);
+void history_buffer_process(history_buffer *buf, distance_t *distances,
+                            bit_writer_t *output);
+void history_buffer_flush(history_buffer *buf, bit_writer_t *output);
--- a/core/libcorrect/include/correct/convolutional/lookup.h
+++ b/core/libcorrect/include/correct/convolutional/lookup.h
@@ -0,0 +1,27 @@
+#ifndef CORRECT_CONVOLUTIONAL_LOOKUP
+#define CORRECT_CONVOLUTIONAL_LOOKUP
+#include "correct/convolutional.h"
+
+typedef unsigned int distance_pair_key_t;
+typedef uint32_t output_pair_t;
+typedef uint32_t distance_pair_t;
+
+typedef struct {
+    distance_pair_key_t *keys;
+    output_pair_t *outputs;
+    output_pair_t output_mask;
+    unsigned int output_width;
+    size_t outputs_len;
+    distance_pair_t *distances;
+} pair_lookup_t;
+
+void fill_table(unsigned int order,
+                unsigned int rate,
+                const polynomial_t *poly,
+                unsigned int *table);
+pair_lookup_t pair_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table);
+void pair_lookup_destroy(pair_lookup_t pairs);
+void pair_lookup_fill_distance(pair_lookup_t pairs, distance_t *distances);
+#endif
--- a/core/libcorrect/include/correct/convolutional/metric.h
+++ b/core/libcorrect/include/correct/convolutional/metric.h
@@ -0,0 +1,20 @@
+#include "correct/convolutional.h"
+
+// Hamming distance between two bit strings: the number of set bits in x XOR y
+static inline distance_t metric_distance(unsigned int x, unsigned int y) {
+    const unsigned int differing_bits = x ^ y;
+    return popcount(differing_bits);
+}
+
+static inline distance_t metric_soft_distance_linear(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
+    // sum over len symbols of |soft_y[i] - ideal|, where ideal is 255 for a 1 bit of hard_x (LSB first), else 0
+    distance_t dist = 0;
+    for (size_t i = 0; i < len; i++, hard_x >>= 1) {
+        // signed arithmetic throughout: no out-of-range unsigned -> int conversion
+        const int d = (int)soft_y[i] - ((hard_x & 1) ? 255 : 0);
+        dist += (d < 0) ? -d : d;
+    }
+    return dist;
+}
+
+distance_t metric_soft_distance_quadratic(unsigned int hard_x, const uint8_t *soft_y, size_t len);
--- a/core/libcorrect/include/correct/convolutional/sse/convolutional.h
+++ b/core/libcorrect/include/correct/convolutional/sse/convolutional.h
@@ -0,0 +1,15 @@
+#include "correct/convolutional/convolutional.h"
+#include "correct/convolutional/sse/lookup.h"
+// BIG HEAPING TODO sort out the include mess
+#include "correct-sse.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+
+struct correct_convolutional_sse {
+    correct_convolutional base_conv;  // the portable codec state, embedded by value as the first member
+    oct_lookup_t oct_lookup;          // eight-output lookup table, declared in correct/convolutional/sse/lookup.h
+};
--- a/core/libcorrect/include/correct/convolutional/sse/lookup.h
+++ b/core/libcorrect/include/correct/convolutional/sse/lookup.h
@@ -0,0 +1,65 @@
+#include "correct/convolutional/lookup.h"
+#ifdef _MSC_VER
+#include <intrin.h>
+#else
+#include <x86intrin.h>
+#endif
+
+typedef unsigned int distance_quad_key_t;
+typedef unsigned int output_quad_t;
+typedef uint64_t distance_quad_t;
+
+typedef struct {
+    distance_quad_key_t *keys;
+    output_quad_t *outputs;
+    output_quad_t output_mask;
+    unsigned int output_width;
+    size_t outputs_len;
+    distance_quad_t *distances;
+} quad_lookup_t;
+
+typedef uint16_t distance_oct_key_t;
+typedef uint64_t output_oct_t;
+typedef uint64_t distance_oct_t;
+
+typedef struct {
+    distance_oct_key_t *keys;
+    output_oct_t *outputs;      // each entry packs eight 8-bit indices into a distance table (see oct_lookup_fill_distance)
+    output_oct_t output_mask;
+    unsigned int output_width;
+    size_t outputs_len;         // number of entries in outputs; entry 0 is skipped by oct_lookup_fill_distance
+    distance_oct_t *distances;  // filled as four distance_pair_t words per outputs entry
+} oct_lookup_t;
+
+quad_lookup_t quad_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table);
+void quad_lookup_destroy(quad_lookup_t quads);
+void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances);
+distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys);
+oct_lookup_t oct_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table);
+void oct_lookup_destroy(oct_lookup_t octs);
+static inline void oct_lookup_fill_distance(oct_lookup_t octs, distance_t *distances) {
+    // for every outputs entry except index 0: fetch the eight distances it indexes and store them as four packed pairs
+    distance_pair_t *pairs = (distance_pair_t*)octs.distances;
+    for (unsigned int i = 1; i < octs.outputs_len; i += 1) {
+        output_oct_t concat_out = octs.outputs[i];
+        unsigned int i_0 = concat_out & 0xff;
+        unsigned int i_1 = (concat_out >> 8) & 0xff;
+        unsigned int i_2 = (concat_out >> 16) & 0xff;
+        unsigned int i_3 = (concat_out >> 24) & 0xff;
+        // widen before shifting: if distance_t is narrower than int it is promoted to int, and << 16 could overflow
+        pairs[i*4 + 1] = (distance_pair_t)distances[i_3] << 16 | distances[i_2];
+        pairs[i*4 + 0] = (distance_pair_t)distances[i_1] << 16 | distances[i_0];
+
+        concat_out >>= 32;
+        unsigned int i_4 = concat_out & 0xff;
+        unsigned int i_5 = (concat_out >> 8) & 0xff;
+        unsigned int i_6 = (concat_out >> 16) & 0xff;
+        unsigned int i_7 = (concat_out >> 24) & 0xff;
+        pairs[i*4 + 3] = (distance_pair_t)distances[i_7] << 16 | distances[i_6];
+        pairs[i*4 + 2] = (distance_pair_t)distances[i_5] << 16 | distances[i_4];
+    }
+}
--- a/core/libcorrect/include/correct/portable.h
+++ b/core/libcorrect/include/correct/portable.h
@@ -0,0 +1,20 @@
+#ifdef __GNUC__
+#define HAVE_BUILTINS
+#endif
+
+
+#ifdef HAVE_BUILTINS
+#define popcount __builtin_popcount
+#define prefetch __builtin_prefetch
+#else
+
+static inline int popcount(int x) {
+    /* http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel, in unsigned so no shift or multiply overflows */
+    unsigned int v = (unsigned int)x - (((unsigned int)x >> 1) & 0x55555555u);
+    v = (v & 0x33333333u) + ((v >> 2) & 0x33333333u);
+    return (int)((((v + (v >> 4)) & 0x0f0f0f0fu) * 0x01010101u) >> 24);
+}
+
+static inline void prefetch(const void *x) { (void)x; /* no-op fallback; const so read-only pointers are accepted too */ }
+
+#endif
--- a/core/libcorrect/include/correct/reed-solomon.h
+++ b/core/libcorrect/include/correct/reed-solomon.h
@@ -0,0 +1,76 @@
+#ifndef CORRECT_REED_SOLOMON
+#define CORRECT_REED_SOLOMON
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+#include <stdint.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+// an element in GF(2^8)
+typedef uint8_t field_element_t;
+
+// a power of the primitive element alpha
+typedef uint8_t field_logarithm_t;
+
+// give us some bits of headroom to do arithmetic
+// variables of this type aren't really in any proper space
+typedef uint16_t field_operation_t;
+
+// generated by find_poly
+typedef struct {
+    const field_element_t *exp;    // as built by field_create: 512 entries, exp[i] = alpha^i
+    const field_logarithm_t *log;  // as built by field_create: 256 entries, indexed by field element
+} field_t;
+
+typedef struct {
+    field_element_t *coeff;
+    unsigned int order;
+} polynomial_t;
+
+struct correct_reed_solomon {
+    size_t block_length;
+    size_t message_length;
+    size_t min_distance;
+
+    field_logarithm_t first_consecutive_root;
+    field_logarithm_t generator_root_gap;
+
+    field_t field;
+
+    polynomial_t generator;
+    field_element_t *generator_roots;
+    field_logarithm_t **generator_root_exp;
+
+    polynomial_t encoded_polynomial;
+    polynomial_t encoded_remainder;
+
+    field_element_t *syndromes;
+    field_element_t *modified_syndromes;
+    polynomial_t received_polynomial;
+    polynomial_t error_locator;
+    polynomial_t error_locator_log;
+    polynomial_t erasure_locator;
+    field_element_t *error_roots;
+    field_element_t *error_vals;
+    field_logarithm_t *error_locations;
+
+    field_logarithm_t **element_exp;
+
+    // scratch
+    // (do no allocations at steady state)
+
+    // used during find_error_locator
+    polynomial_t last_error_locator;
+
+    // used during error value search
+    polynomial_t error_evaluator;
+    polynomial_t error_locator_derivative;
+    polynomial_t init_from_roots_scratch[2];
+    bool has_init_decode;
+
+};
+#endif
--- a/core/libcorrect/include/correct/reed-solomon/decode.h
+++ b/core/libcorrect/include/correct/reed-solomon/decode.h
@@ -0,0 +1,3 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+#include "correct/reed-solomon/polynomial.h"
--- a/core/libcorrect/include/correct/reed-solomon/encode.h
+++ b/core/libcorrect/include/correct/reed-solomon/encode.h
@@ -0,0 +1,3 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+#include "correct/reed-solomon/polynomial.h"
--- a/core/libcorrect/include/correct/reed-solomon/field.h
+++ b/core/libcorrect/include/correct/reed-solomon/field.h
@@ -0,0 +1,167 @@
+#ifndef CORRECT_REED_SOLOMON_FIELD
+#define CORRECT_REED_SOLOMON_FIELD
+#include "correct/reed-solomon.h"
+
+/*
+field_t field_create(field_operation_t primitive_poly);
+void field_destroy(field_t field);
+field_element_t field_add(field_t field, field_element_t l, field_element_t r);
+field_element_t field_sub(field_t field, field_element_t l, field_element_t r);
+field_element_t field_sum(field_t field, field_element_t elem, unsigned int n);
+field_element_t field_mul(field_t field, field_element_t l, field_element_t r);
+field_element_t field_div(field_t field, field_element_t l, field_element_t r);
+field_logarithm_t field_mul_log(field_t field, field_logarithm_t l, field_logarithm_t r);
+field_logarithm_t field_div_log(field_t field, field_logarithm_t l, field_logarithm_t r);
+field_element_t field_mul_log_element(field_t field, field_logarithm_t l, field_logarithm_t r);
+field_element_t field_pow(field_t field, field_element_t elem, int pow);
+*/
+
+static inline field_element_t field_mul_log_element(field_t field, field_logarithm_t l, field_logarithm_t r) {
+    // multiplies two values given as logarithms and returns the product as an element
+    // l + r is at most 510, which the 512-entry exp table covers, so no wrap check is needed
+    const field_operation_t log_sum = (field_operation_t)(l + r);
+    return field.exp[log_sum];
+}
+
+static inline field_t field_create(field_operation_t primitive_poly) {
+    // builds the exp and log tables of GF(2^8) for the given primitive polynomial
+    // exp has 512 entries so that it can hold a "wraparound" which prevents some modulo ops
+    // log has 256 entries. no wraparound here, the indices into this table are field elements
+    // if either allocation fails, nothing is leaked and both pointers in the result are NULL
+    field_t field = {NULL, NULL};
+    field_element_t *exp = malloc(512 * sizeof(field_element_t));
+    field_logarithm_t *log = malloc(256 * sizeof(field_logarithm_t));
+    if (!exp || !log) {
+        free(exp);
+        free(log);
+        return field;
+    }
+
+    // assume alpha is a primitive element, p(x) (primitive_poly) irreducible in GF(2^8)
+    // addition is xor
+    // subtraction is addition (also xor)
+    // e.g. x^5 + x^4 + x^4 + x^2 + 1 = x^5 + x^2 + 1
+    // exp[i] is alpha^i, its polynomial coefficients packed as 8 bits into one byte
+    // log[e] is the exponent i in 1..255 with alpha^i == e (so log[1] ends up as 255, not 0)
+
+    field_operation_t element = 1;
+    exp[0] = (field_element_t)element;
+    log[0] = (field_logarithm_t)0;  // really, it's undefined. we shouldn't ever access this
+    for (field_operation_t i = 1; i < 512; i++) {
+        element = element * 2;
+        element = (element > 255) ? (element ^ primitive_poly) : element;
+        exp[i] = (field_element_t)element;
+        if (i < 256) {
+            log[element] = (field_logarithm_t)i;
+        }
+    }
+    // the members are pointers to const data, so plain assignment works without a cast
+    field.exp = exp;
+    field.log = log;
+    return field;
+}
+
+static inline void field_destroy(field_t field) {
+    free((void *)field.exp);  // frees both tables; field_t holds them as pointers to const, so drop the qualifier
+    free((void *)field.log);
+}
+
+static inline field_element_t field_add(field_t field, field_element_t l, field_element_t r) {
+    return (field_element_t)(l ^ r);  // addition here is bitwise XOR; the field tables are not consulted
+}
+
+static inline field_element_t field_sub(field_t field, field_element_t l, field_element_t r) {
+    return (field_element_t)(l ^ r);  // subtraction is the same bitwise XOR as addition; field is not consulted
+}
+
+static inline field_element_t field_sum(field_t field, field_element_t elem, unsigned int n) {
+    // returns the sum of n copies of elem, in closed form rather than by
+    // calling field_add n times
+
+    // field addition is bytewise XOR and elem XOR elem is 0, so the terms
+    // cancel in pairs:
+    //   n = 1 -> elem
+    //   n = 2 -> elem XOR elem = 0
+    //   n = 3 -> elem again
+    // in general an even n leaves 0 and an odd n leaves elem
+
+    if ((n & 1u) == 0) {
+        return 0;
+    }
+    return elem;
+}
+
+static inline field_element_t field_mul(field_t field, field_element_t l, field_element_t r) {
+    // zero has no logarithm, and anything times zero is zero
+    if (!l || !r) {
+        return 0;
+    }
+
+    // the product of two nonzero elements is found by adding their
+    // logarithms and exponentiating the sum
+    const field_logarithm_t log_l = field.log[l];
+    const field_logarithm_t log_r = field.log[r];
+    const field_operation_t log_sum = (field_operation_t)(log_l + log_r);
+
+    // a sum above 255 would normally be wrapped (alpha^255 = 1, so
+    // alpha^256 = alpha^1), but the exp table is built past that point:
+    // the largest sum is 255 + 255 = 510, which is still a valid index
+    return field.exp[log_sum];
+}
+
+static inline field_element_t field_div(field_t field, field_element_t l, field_element_t r) {
+    // 0 / r is 0; division by zero has no answer and also yields 0 here
+    // XXX ???
+    if (l == 0 || r == 0) {
+        return 0;
+    }
+
+    // division is subtraction of logarithms
+
+    // log[l] - log[r] would go negative whenever log[r] is the larger one,
+    // so 255 is always added first (alpha^255 = 1 leaves the result
+    // unchanged). the biased difference can exceed 255, which is fine
+    // because the exp table runs up to 511.
+    const field_operation_t biased_l = (field_operation_t)(255 + field.log[l]);
+    const field_operation_t log_diff = (field_operation_t)(biased_l - field.log[r]);
+
+    return field.exp[log_diff];
+}
+
+static inline field_logarithm_t field_mul_log(field_t field, field_logarithm_t l, field_logarithm_t r) {
+    // field_mul for operands that are already logarithms: the logarithm of
+    // the product is their sum, so the lookups at the start are skipped
+    const field_operation_t log_sum = (field_operation_t)(l + r);
+    // no table is consulted here, so the value handed back must itself be a
+    // valid logarithm, i.e. fit the 8-bit range [0, 255]. sums above 255 are
+    // wrapped by subtracting 255, which keeps chains of multiplications from
+    // reaching past the end of the exp table when the result is finally
+    // converted back to an element
+    if (log_sum <= 255) {
+        return (field_logarithm_t)log_sum;
+    }
+    return (field_logarithm_t)(log_sum - 255);
+}
+
+static inline field_logarithm_t field_div_log(field_t field, field_logarithm_t l, field_logarithm_t r) {
+    // field_div on logarithms: subtract, biased by 255 so the difference never
+    // goes negative, then wrap values above 255 back into the logarithm range
+    const field_operation_t biased_diff = (field_operation_t)(255 + l - r);
+    const field_operation_t wrapped = (biased_diff > 255) ? (field_operation_t)(biased_diff - 255) : biased_diff;
+
+    return (field_logarithm_t)wrapped;
+}
+
+static inline field_element_t field_pow(field_t field, field_element_t elem, int pow) {
+    // elem^pow via logarithms. zero has no logarithm: 0^pow is 0 (for pow < 0 this mirrors field_div by zero)
+    if (elem == 0 && pow != 0) {
+        return 0;
+    }
+    // exponents repeat every 255, so reduce pow first; log * pow then cannot overflow
+    int mod = ((int)field.log[elem] * (pow % 255)) % 255;
+    if (mod < 0) {
+        mod += 255;
+    }
+    return field.exp[mod];
+}
+#endif
--- a/core/libcorrect/include/correct/reed-solomon/polynomial.h
+++ b/core/libcorrect/include/correct/reed-solomon/polynomial.h
@@ -0,0 +1,14 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+
+polynomial_t polynomial_create(unsigned int order);
+void polynomial_destroy(polynomial_t polynomial);
+void polynomial_mul(field_t field, polynomial_t l, polynomial_t r, polynomial_t res);
+void polynomial_mod(field_t field, polynomial_t dividend, polynomial_t divisor, polynomial_t mod);
+void polynomial_formal_derivative(field_t field, polynomial_t poly, polynomial_t der);
+field_element_t polynomial_eval(field_t field, polynomial_t poly, field_element_t val);
+field_element_t polynomial_eval_lut(field_t field, polynomial_t poly, const field_logarithm_t *val_exp);
+field_element_t polynomial_eval_log_lut(field_t field, polynomial_t poly_log, const field_logarithm_t *val_exp);
+void polynomial_build_exp_lut(field_t field, field_element_t val, unsigned int order, field_logarithm_t *val_exp);
+polynomial_t polynomial_init_from_roots(field_t field, unsigned int nroots, field_element_t *roots, polynomial_t poly, polynomial_t *scratch);
+polynomial_t polynomial_create_from_roots(field_t field, unsigned int nroots, field_element_t *roots);
--- a/core/libcorrect/include/correct/reed-solomon/reed-solomon.h
+++ b/core/libcorrect/include/correct/reed-solomon/reed-solomon.h
@@ -0,0 +1,3 @@
+#include "correct/reed-solomon.h"
+#include "correct/reed-solomon/field.h"
+#include "correct/reed-solomon/polynomial.h"
--- a/core/libcorrect/include/correct/util/error-sim-fec.h
+++ b/core/libcorrect/include/correct/util/error-sim-fec.h
@@ -0,0 +1,8 @@
+#include "correct/util/error-sim.h"
+
+#include <fec.h>
+
+void conv_fec27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+void conv_fec615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
--- a/core/libcorrect/include/correct/util/error-sim-shim.h
+++ b/core/libcorrect/include/correct/util/error-sim-shim.h
@@ -0,0 +1,7 @@
+#include "correct/util/error-sim.h"
+#include "fec_shim.h"
+
+ssize_t conv_shim27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
+ssize_t conv_shim615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
--- a/core/libcorrect/include/correct/util/error-sim-sse.h
+++ b/core/libcorrect/include/correct/util/error-sim-sse.h
@@ -0,0 +1,7 @@
+#include "correct/util/error-sim.h"
+
+#include "correct-sse.h"
+
+size_t conv_correct_sse_enclen(void *conv_v, size_t msg_len);
+void conv_correct_sse_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+ssize_t conv_correct_sse_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
--- a/core/libcorrect/include/correct/util/error-sim.h
+++ b/core/libcorrect/include/correct/util/error-sim.h
@@ -0,0 +1,47 @@
+#include <stdbool.h>
+#include <math.h>
+#include <string.h>
+#include <stdlib.h>
+#include <float.h>
+#include <stdio.h>
+
+#include "correct.h"
+#include "correct/portable.h"
+
+size_t distance(uint8_t *a, uint8_t *b, size_t len);
+void gaussian(double *res, size_t n_res, double sigma);
+
+void encode_bpsk(uint8_t *msg, double *voltages, size_t n_syms, double bpsk_voltage);
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits);
+void decode_bpsk(uint8_t *soft, uint8_t *msg, size_t n_syms);
+void decode_bpsk_soft(double *voltages, uint8_t *soft, size_t n_syms, double bpsk_voltage);
+double log2amp(double l);
+double amp2log(double a);
+double sigma_for_eb_n0(double eb_n0, double bpsk_bit_energy);
+void build_white_noise(double *noise, size_t n_syms, double eb_n0, double bpsk_bit_energy);
+void add_white_noise(double *signal, double *noise, size_t n_syms);
+
+typedef struct {
+    uint8_t *msg_out;
+    size_t msg_len;
+    uint8_t *encoded;
+    double *v;
+    double *corrupted;
+    uint8_t *soft;
+    double *noise;
+    size_t enclen;
+    size_t enclen_bytes;
+    void (*encode)(void *, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+    void *encoder;
+    ssize_t (*decode)(void *, uint8_t *soft, size_t soft_len, uint8_t *msg);
+    void *decoder;
+} conv_testbench;
+
+conv_testbench *resize_conv_testbench(conv_testbench *scratch, size_t (*enclen)(void *, size_t), void *enc, size_t msg_len);
+void free_scratch(conv_testbench *scratch);
+int test_conv_noise(conv_testbench *scratch, uint8_t *msg, size_t n_bytes,
+                    double bpsk_voltage);
+
+size_t conv_correct_enclen(void *conv_v, size_t msg_len);
+void conv_correct_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded);
+ssize_t conv_correct_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg);
--- a/core/libcorrect/include/fec_shim.h
+++ b/core/libcorrect/include/fec_shim.h
@@ -0,0 +1,74 @@
+#ifndef CORRECT_FEC_H
+#define CORRECT_FEC_H
+// libcorrect's libfec shim header
+// this is a partial implementation of libfec
+// header signatures derived from found usages of libfec -- some things may be different
+#include <correct.h>
+
+// Reed-Solomon
+void *init_rs_char(int symbol_size, int primitive_polynomial, int first_consecutive_root,
+                   int root_gap, int number_roots, unsigned int pad);
+void free_rs_char(void *rs);
+void encode_rs_char(void *rs, const unsigned char *msg, unsigned char *parity);
+void decode_rs_char(void *rs, unsigned char *block, int *erasure_locations, int num_erasures);
+
+// Convolutional Codes
+
+// Polynomials
+// These have been determined via find_conv_libfec_poly.c
+// We could just make up new ones, but we use libfec's here so that
+//   codes encoded by this library can be decoded by the original libfec
+//   and vice-versa
+#define V27POLYA 0155
+#define V27POLYB 0117
+
+#define V29POLYA 0657
+#define V29POLYB 0435
+
+#define V39POLYA 0755
+#define V39POLYB 0633
+#define V39POLYC 0447
+
+#define V615POLYA 042631
+#define V615POLYB 047245
+#define V615POLYC 056507
+#define V615POLYD 073363
+#define V615POLYE 077267
+#define V615POLYF 064537
+
+// Convolutional Methods
+void *create_viterbi27(int num_decoded_bits);
+int init_viterbi27(void *vit, int _mystery);
+int update_viterbi27_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi27(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi27(void *vit);
+
+void *create_viterbi29(int num_decoded_bits);
+int init_viterbi29(void *vit, int _mystery);
+int update_viterbi29_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi29(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi29(void *vit);
+
+void *create_viterbi39(int num_decoded_bits);
+int init_viterbi39(void *vit, int _mystery);
+int update_viterbi39_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi39(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi39(void *vit);
+
+void *create_viterbi615(int num_decoded_bits);
+int init_viterbi615(void *vit, int _mystery);
+int update_viterbi615_blk(void *vit, unsigned char *encoded_soft, int n_encoded_groups);
+int chainback_viterbi615(void *vit, unsigned char *decoded, unsigned int n_decoded_bits, unsigned int _mystery);
+void delete_viterbi615(void *vit);
+
+// Misc other
+static inline int parity(unsigned int x) {
+    /* http://graphics.stanford.edu/~seander/bithacks.html#ParityParallel */
+    // fold the word into its low nibble, then look that nibble up in the packed table 0x6996
+    for (unsigned int shift = 16; shift >= 4; shift >>= 1) {
+        x ^= x >> shift;
+    }
+    return (int)((0x6996u >> (x & 0xfu)) & 1u);
+}
+
+#endif
--- a/core/libcorrect/src/CMakeLists.txt
+++ b/core/libcorrect/src/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(convolutional)
+add_subdirectory(reed-solomon)
--- a/core/libcorrect/src/convolutional/CMakeLists.txt
+++ b/core/libcorrect/src/convolutional/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(SRCFILES bit.c metric.c history_buffer.c error_buffer.c lookup.c convolutional.c encode.c decode.c)
+add_library(correct-convolutional OBJECT ${SRCFILES})  # object library: compiles the sources without archiving or linking them
+if(HAVE_SSE)  # NOTE(review): HAVE_SSE is not set in this file - presumably by a parent CMakeLists; confirm
+    add_subdirectory(sse)
+endif()
--- a/core/libcorrect/src/convolutional/bit.c
+++ b/core/libcorrect/src/convolutional/bit.c
@@ -0,0 +1,232 @@
+#include "correct/convolutional/bit.h"
+
+bit_writer_t *bit_writer_create(uint8_t *bytes, size_t len) {
+    // zero-initialised writer, attached to bytes when a buffer is given; NULL if the allocation fails
+    bit_writer_t *w = calloc(1, sizeof(*w));
+    if (w && bytes) {
+        bit_writer_reconfigure(w, bytes, len);
+    }
+
+    return w;
+}
+
+void bit_writer_reconfigure(bit_writer_t *w, uint8_t *bytes, size_t len) {
+    // points the writer at a new buffer and restarts it at that buffer's first bit
+    w->byte_index = 0;
+    w->current_byte_len = 0;
+    w->current_byte = 0;
+    w->len = len;
+    w->bytes = bytes;
+}
+
+void bit_writer_destroy(bit_writer_t *w) {
+    free(w);  // releases only the writer itself; the attached bytes buffer is not freed here
+}
+
+void bit_writer_write(bit_writer_t *w, uint8_t val, unsigned int n) {
+    // appends the n low-order bits of val, least significant bit first
+    for (unsigned int remaining = n; remaining > 0; remaining--, val >>= 1) {
+        bit_writer_write_1(w, val);
+    }
+}
+
+void bit_writer_write_1(bit_writer_t *w, uint8_t val) {
+    // appends the low bit of val; bits written earlier end up in higher positions
+    w->current_byte |= val & 1;
+    w->current_byte_len++;
+    if (w->current_byte_len != 8) {
+        // byte still open: make room for the next bit
+        w->current_byte <<= 1;
+        return;
+    }
+    // byte complete: store it and start an empty one
+    w->bytes[w->byte_index++] = w->current_byte;
+    w->current_byte_len = 0;
+    w->current_byte = 0;
+}
+
+void bit_writer_write_bitlist(bit_writer_t *w, uint8_t *l, size_t len) {
+    // appends len bits from l, one bit per element, first element first.
+    // elements are ORed in unmasked, so each one is expected to be 0 or 1.
+    size_t close_len = 8 - w->current_byte_len;
+    close_len = (close_len < len) ? close_len : len;
+    // step 1: close the currently open byte, or as much of it as l can fill
+    uint16_t b = w->current_byte;
+    // b is wider than a byte: every iteration shifts after ORing, so a full byte sits one place too high
+    for (ptrdiff_t i = 0; i < close_len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+
+    l += close_len;
+    len -= close_len;
+
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+
+    if (w->current_byte_len + close_len == 8) {
+        b >>= 1;  // undo the extra shift from the loop before storing
+        bytes[byte_index] = b;
+        byte_index++;
+    } else {
+        w->current_byte = b;  // still short of a byte: keep it (pre-shifted) and stop
+        w->current_byte_len += close_len;
+        return;
+    }
+    // step 2: whole bytes, eight elements at a time, first element in the most significant bit
+    size_t full_bytes = len/8;
+
+    for (size_t i = 0; i < full_bytes; i++) {
+        bytes[byte_index] = l[0] << 7 | l[1] << 6 | l[2] << 5 |
+                            l[3] << 4 | l[4] << 3 | l[5] << 2 |
+                            l[6] << 1 | l[7];
+        byte_index += 1;
+        l += 8;
+    }
+
+    len -= 8*full_bytes;
+    // step 3: the fewer-than-eight leftover bits start a new open byte, again stored pre-shifted
+    b = 0;
+    for (ptrdiff_t i = 0; i < len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+    w->current_byte = b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len) {
+    l = l + len - 1;  // start at the last element; l only moves backwards from here
+    // appends len bits from l in reverse order (last element first); each element is expected to be 0 or 1
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+    uint16_t b;
+    // step 1: if a byte is open, close it, or as much of it as l can fill
+    if (w->current_byte_len != 0) {
+        size_t close_len = 8 - w->current_byte_len;
+        close_len = (close_len < len) ? close_len : len;
+
+        b = w->current_byte;
+
+        for (ptrdiff_t i = 0; i < close_len; i++) {
+            b |= *l;
+            b <<= 1;
+            l--;
+        }
+
+        len -= close_len;
+
+        if (w->current_byte_len + close_len == 8) {
+            b >>= 1;  // the loop shifted once more than needed; undo that before storing
+            bytes[byte_index] = b;
+            byte_index++;
+        } else {
+            w->current_byte = b;  // still short of a byte: keep it (pre-shifted) and stop
+            w->current_byte_len += close_len;
+            return;
+        }
+    }
+    // step 2: whole bytes, eight elements at a time, walking backwards through l
+    size_t full_bytes = len/8;
+
+    for (size_t i = 0; i < full_bytes; i++) {
+        bytes[byte_index] = l[0] << 7 | l[-1] << 6 | l[-2] << 5 |
+                            l[-3] << 4 | l[-4] << 3 | l[-5] << 2 |
+                            l[-6] << 1 | l[-7];
+        byte_index += 1;
+        l -= 8;
+    }
+
+    len -= 8*full_bytes;
+    // step 3: the fewer-than-eight leftover bits start a new open byte, stored pre-shifted
+    b = 0;
+    for (ptrdiff_t i = 0; i < len; i++) {
+        b |= *l;
+        b <<= 1;
+        l--;
+    }
+
+    w->current_byte = (uint8_t)b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+void bit_writer_flush_byte(bit_writer_t *w) {  // zero-pads and emits a partially filled byte; no-op on a byte boundary
+    if (w->current_byte_len != 0) {
+        // current_byte is kept pre-shifted by one place (see bit_writer_write_1), so 7 - len, not 8 - len, left-aligns it
+        w->current_byte <<= (7 - w->current_byte_len);
+        w->bytes[w->byte_index++] = w->current_byte;
+        w->current_byte_len = 0;
+    }
+}
+
+size_t bit_writer_length(bit_writer_t *w) {
+    return w->byte_index;  // whole bytes emitted so far; bits of a still-open byte are not counted
+}
+
+uint8_t reverse_byte(uint8_t b) {
+    uint8_t mirrored = 0;  // b with its bit order reversed: bit k of b becomes bit 7 - k
+    for (int bit = 0; bit < 8; bit++) mirrored |= (uint8_t)(((b >> bit) & 1) << (7 - bit));
+    return mirrored;
+}
+
+static uint8_t reverse_table[256];
+
+void create_reverse_table() {
+    for (unsigned int value = 0; value <= 0xff; value++) {  // every byte value gets its bit-mirrored twin
+        reverse_table[value] = reverse_byte((uint8_t)value);
+    }
+}
+
+bit_reader_t *bit_reader_create(const uint8_t *bytes, size_t len) {
+    // zero-initialised reader, attached to bytes when a buffer is given; NULL if the allocation fails
+    bit_reader_t *r = calloc(1, sizeof(*r));
+
+    // the shared reverse_table is filled on first use; this flag is not synchronised between threads
+    static bool reverse_table_created = false;
+    if (!reverse_table_created) {
+        create_reverse_table();
+        reverse_table_created = true;
+    }
+
+    if (r && bytes) {
+        bit_reader_reconfigure(r, bytes, len);
+    }
+    return r;
+}
+
+void bit_reader_reconfigure(bit_reader_t *r, const uint8_t *bytes, size_t len) {
+    // attaches a new input buffer and positions the reader at its first bit
+    r->bytes = bytes;
+    r->len = len;
+    r->byte_index = 0;
+    r->current_byte_len = 8;
+    r->current_byte = (bytes && len > 0) ? bytes[0] : 0;  // an empty buffer has no first byte to preload
+}
+
+void bit_reader_destroy(bit_reader_t *r) {
+    free(r);  // releases only the reader itself; the attached bytes buffer is not freed here
+}
+
+uint8_t bit_reader_read(bit_reader_t *r, unsigned int n) {
+    unsigned int read = 0;
+    unsigned int n_copy = n;  // n is consumed below; keep the requested width for the final shift
+    // reads the next n bits; the bit consumed first ends up in bit 0 of the result
+    if (r->current_byte_len < n) {
+        // fewer than n bits left in this byte: take them all, then continue in the next byte
+        read = r->current_byte & ((1 << r->current_byte_len) - 1);
+        r->byte_index++;
+        r->current_byte = r->bytes[r->byte_index];  // NOTE(review): no check against r->len before this read
+        n -= r->current_byte_len;
+        r->current_byte_len = 8;
+        read <<= n;
+    }
+    // take the top n unread bits of the current byte and append them below what was already read
+    uint8_t copy_mask = (1 << n) - 1;
+    copy_mask <<= (r->current_byte_len - n);
+    read |= (r->current_byte & copy_mask) >> (r->current_byte_len - n);
+    r->current_byte_len -= n;
+    return reverse_table[read] >> (8 - n_copy);  // mirror, then drop the unused low bits
+}
--- a/core/libcorrect/src/convolutional/convolutional.c
+++ b/core/libcorrect/src/convolutional/convolutional.c
@@ -0,0 +1,59 @@
+#include "correct/convolutional/convolutional.h"
+
+// https://www.youtube.com/watch?v=b3_lVSrPB6w
+
+/*
+ * Initialise an already-allocated convolutional codec in place.
+ *
+ * conv:  storage to initialise (owned by the caller)
+ * rate:  number of output bits per input bit; must be at least 2
+ * order: shift register length in bits; must not exceed the bit width of
+ *        shift_register_t
+ * poly:  `rate` generator polynomials
+ *
+ * Returns conv on success. Returns NULL when a parameter is rejected or an
+ * allocation fails; in that case nothing allocated here is left behind and
+ * the caller remains responsible for conv itself.
+ *
+ * Decoder state is not allocated here (has_init_decode is set to false).
+ */
+correct_convolutional *_correct_convolutional_init(correct_convolutional *conv,
+                                                   size_t rate, size_t order,
+                                                   const polynomial_t *poly) {
+    if (order > 8 * sizeof(shift_register_t)) {
+        // XXX turn this into an error code
+        // order must not exceed 8 * sizeof(shift_register_t)
+        return NULL;
+    }
+    if (rate < 2) {
+        // XXX turn this into an error code
+        // rate must be 2 or greater
+        return NULL;
+    }
+
+    // one table row per shift register state
+    unsigned int *table = malloc(sizeof(unsigned int) * (1 << order));
+    if (!table) {
+        return NULL;
+    }
+
+    conv->order = order;
+    conv->rate = rate;
+    conv->numstates = 1 << order;
+
+    fill_table(conv->rate, conv->order, poly, table);
+    // the table member is declared const; it is written once, here
+    *(unsigned int **)&conv->table = table;
+
+    conv->bit_writer = bit_writer_create(NULL, 0);
+    conv->bit_reader = bit_reader_create(NULL, 0);
+    if (!conv->bit_writer || !conv->bit_reader) {
+        // undo the partial setup so the caller can simply free conv
+        if (conv->bit_writer) {
+            bit_writer_destroy(conv->bit_writer);
+        }
+        if (conv->bit_reader) {
+            bit_reader_destroy(conv->bit_reader);
+        }
+        free(table);
+        *(unsigned int **)&conv->table = NULL;
+        return NULL;
+    }
+
+    conv->has_init_decode = false;
+    return conv;
+}
+
+/*
+ * Allocate and initialise a convolutional codec.
+ *
+ * rate, order, poly: forwarded to _correct_convolutional_init
+ *
+ * Returns the new codec, or NULL if the allocation fails or the parameters
+ * are rejected by _correct_convolutional_init. Release it with
+ * correct_convolutional_destroy.
+ */
+correct_convolutional *correct_convolutional_create(size_t rate, size_t order,
+                                                    const polynomial_t *poly) {
+    correct_convolutional *conv = malloc(sizeof(correct_convolutional));
+    if (!conv) {
+        // out of memory: do not hand a NULL codec to the initialiser
+        return NULL;
+    }
+    if (!_correct_convolutional_init(conv, rate, order, poly)) {
+        free(conv);
+        return NULL;
+    }
+    return conv;
+}
+
+/*
+ * Release everything owned by an initialised codec, but not the codec
+ * structure itself. Decoder state is released only if it was ever set up
+ * (has_init_decode).
+ */
+void _correct_convolutional_teardown(correct_convolutional *conv) {
+    // the table member is declared const, hence the cast to free it
+    unsigned int *output_table = *(unsigned int **)&conv->table;
+    free(output_table);
+
+    bit_writer_destroy(conv->bit_writer);
+    bit_reader_destroy(conv->bit_reader);
+
+    if (!conv->has_init_decode) {
+        return;
+    }
+
+    pair_lookup_destroy(conv->pair_lookup);
+    history_buffer_destroy(conv->history_buffer);
+    error_buffer_destroy(conv->errors);
+    free(conv->distances);
+}
+
+/*
+ * Tear down and free a codec returned by correct_convolutional_create.
+ *
+ * Passing NULL is a no-op, so the result of a failed
+ * correct_convolutional_create can be handed straight to this function.
+ */
+void correct_convolutional_destroy(correct_convolutional *conv) {
+    if (!conv) {
+        return;
+    }
+    _correct_convolutional_teardown(conv);
+    free(conv);
+}
--- a/core/libcorrect/src/convolutional/decode.c
+++ b/core/libcorrect/src/convolutional/decode.c
@@ -0,0 +1,321 @@
+#include "correct/convolutional/convolutional.h"
+
+/*
+ * Debug helper: dump the decoder's current error metrics and stored history
+ * bits to stdout. Nothing is printed for iterations below 2220.
+ *
+ * conv:         decoder whose state is dumped
+ * iter:         iteration number, printed as-is
+ * winner_index: last history slice (inclusive) to print for each state
+ */
+void conv_decode_print_iter(correct_convolutional *conv, unsigned int iter,
+                            unsigned int winner_index) {
+    if (iter >= 2220) {
+        const distance_t *metrics = conv->errors->write_errors;
+
+        printf("iteration: %d\n", iter);
+        printf("errors:\n");
+        for (shift_register_t state = 0; state < conv->numstates / 2; state++) {
+            printf("%2d: %d\n", state, metrics[state]);
+        }
+        printf("\n");
+
+        printf("history:\n");
+        for (shift_register_t state = 0; state < conv->numstates / 2; state++) {
+            printf("%2d: ", state);
+            for (unsigned int slice = 0; slice <= winner_index; slice++) {
+                const int bit = conv->history_buffer->history[slice][state] ? 1 : 0;
+                printf("%d", bit);
+            }
+            printf("\n");
+        }
+        printf("\n");
+    }
+}
+
+/*
+ * First decoding phase: accumulate error metrics while the shift register
+ * is still filling up. Runs for at most conv->order - 1 sets (fewer if the
+ * input is shorter) and writes no history, so no output bits are produced.
+ *
+ * conv: decoder state
+ * sets: total number of encoded sets (groups of conv->rate bits) in the input
+ * soft: soft-decision samples, or NULL to read hard bits from conv->bit_reader
+ */
+void convolutional_decode_warmup(correct_convolutional *conv, unsigned int sets,
+                                 const uint8_t *soft) {
+    // first phase: load shiftregister up from 0 (order goes from 1 to conv->order)
+    // we are building up error metrics for the first order bits
+    for (unsigned int i = 0; i < conv->order - 1 && i < sets; i++) {
+        // peel off rate bits from encoded to recover the same `out` as in the encoding process
+        // the difference being that this `out` will have the channel noise/errors applied
+        // (`out` is only assigned, and only used, on the hard-decision path)
+        unsigned int out;
+        if (!soft) {
+            out = bit_reader_read(conv->bit_reader, conv->rate);
+        }
+        const distance_t *read_errors = conv->errors->read_errors;
+        distance_t *write_errors = conv->errors->write_errors;
+        // walk all of the state we have so far: after i + 1 input bits only
+        // the first 2^(i + 1) register states are visited
+        for (size_t j = 0; j < (1 << (i + 1)); j += 1) {
+            // predecessor state: j with its newest bit shifted back out
+            unsigned int last = j >> 1;
+            distance_t dist;
+            if (soft) {
+                if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                    dist = metric_soft_distance_linear(conv->table[j], soft + i * conv->rate,
+                                                       conv->rate);
+                } else {
+                    dist = metric_soft_distance_quadratic(conv->table[j], soft + i * conv->rate,
+                                                          conv->rate);
+                }
+            } else {
+                dist = metric_distance(conv->table[j], out);
+            }
+            // path metric = this step's distance + the predecessor's metric
+            write_errors[j] = dist + read_errors[last];
+        }
+        error_buffer_swap(conv->errors);
+    }
+}
+
+/*
+ * Main decoding phase: for every set from index conv->order - 1 up to (but
+ * not including) sets - conv->order + 1, update the error metric of every
+ * register state, record which predecessor won in the history buffer, and
+ * let the history buffer emit decoded bits when it is due.
+ *
+ * conv: decoder state (decode buffers must already be initialised)
+ * sets: total number of encoded sets (groups of conv->rate bits) in the input
+ * soft: soft-decision samples, or NULL to read hard bits from conv->bit_reader
+ *
+ * NOTE(review): the loop bound `sets - conv->order + 1` is computed in
+ * unsigned arithmetic; for sets < conv->order - 1 it looks like it would wrap
+ * to a very large value instead of yielding an empty range -- confirm that
+ * callers never pass inputs that short.
+ * NOTE(review): the state loop advances 8 states per outer step with 4 pairs
+ * per inner step, which presumably assumes a minimum number of states --
+ * confirm the smallest supported order.
+ */
+void convolutional_decode_inner(correct_convolutional *conv, unsigned int sets,
+                                const uint8_t *soft) {
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
+        distance_t *distances = conv->distances;
+        // first fill `distances` with the distance from every possible
+        // rate-bit output pattern to the bits received for this set
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            // note: this inner `i` shadows the set index of the outer loop
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        // precompute the distances of each (even state, odd state) output pair
+        pair_lookup_t pair_lookup = conv->pair_lookup;
+        pair_lookup_fill_distance(pair_lookup, distances);
+
+        // upper bound for `high` in the state loop below (twice the high bit)
+        unsigned int num_iter = highbit << 1;
+        // read_errors are the aggregate bit errors for the states of shiftregister for the
+        // previous time slice
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit errors at that path at
+        // this time slice
+        // this loop considers two paths per iteration (high order bit set, clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the least aggregated bit
+        // errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share a successor
+        //
+        // the first set definition is the two states that are the same except for the least order
+        // bit
+        // these two share a predecessor because their high n - 1 bits are the same (differ only by
+        // newest bit)
+        //
+        // the second set definition is the two states that are the same except for the high order
+        // bit
+        // these two share a successor because the oldest high order bit will be shifted out, and
+        // the other bits will be present in the successor
+        //
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += 8, high += 8, base += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 4;
+                 offset += 2, base_offset += 1) {
+                distance_pair_key_t low_key = pair_lookup.keys[base + base_offset];
+                distance_pair_key_t high_key = pair_lookup.keys[highbase + base + base_offset];
+                distance_pair_t low_concat_dist = pair_lookup.distances[low_key];
+                distance_pair_t high_concat_dist = pair_lookup.distances[high_key];
+
+                distance_t low_past_error = read_errors[base + base_offset];
+                distance_t high_past_error = read_errors[highbase + base + base_offset];
+
+                // low 16 bits of the pair: distance for the even successor
+                distance_t low_error = (low_concat_dist & 0xffff) + low_past_error;
+                distance_t high_error = (high_concat_dist & 0xffff) + high_past_error;
+
+                shift_register_t successor = low + offset;
+                distance_t error;
+                uint8_t history_mask;
+                // keep the cheaper predecessor; ties go to the low one
+                if (low_error <= high_error) {
+                    error = low_error;
+                    history_mask = 0;
+                } else {
+                    error = high_error;
+                    history_mask = 1;
+                }
+                write_errors[successor] = error;
+                history[successor] = history_mask;
+
+                shift_register_t low_plus_one = low + offset + 1;
+
+                // high 16 bits of the pair: distance for the odd successor
+                distance_t low_plus_one_error = (low_concat_dist >> 16) + low_past_error;
+                distance_t high_plus_one_error = (high_concat_dist >> 16) + high_past_error;
+
+                shift_register_t plus_one_successor = low_plus_one;
+                distance_t plus_one_error;
+                uint8_t plus_one_history_mask;
+                if (low_plus_one_error <= high_plus_one_error) {
+                    plus_one_error = low_plus_one_error;
+                    plus_one_history_mask = 0;
+                } else {
+                    plus_one_error = high_plus_one_error;
+                    plus_one_history_mask = 1;
+                }
+                write_errors[plus_one_successor] = plus_one_error;
+                history[plus_one_successor] = plus_one_history_mask;
+            }
+        }
+
+        // commit this slice: may renormalise the metrics and/or emit decoded bits
+        history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+/*
+ * Final decoding phase: process the last conv->order - 1 sets, during which
+ * only zero bits are assumed to have been shifted in. Each step therefore
+ * visits only every `skip`-th register state, with `skip` computed from how
+ * many sets remain.
+ *
+ * conv: decoder state (decode buffers must already be initialised)
+ * sets: total number of encoded sets (groups of conv->rate bits) in the input
+ * soft: soft-decision samples, or NULL to read hard bits from conv->bit_reader
+ *
+ * Unlike convolutional_decode_inner, a tie between the two predecessors is
+ * resolved in favour of the high one here (`<` rather than `<=`).
+ */
+void convolutional_decode_tail(correct_convolutional *conv, unsigned int sets,
+                               const uint8_t *soft) {
+    // flush state registers
+    // now we only shift in 0s, skipping 1-successors
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = sets - conv->order + 1; i < sets; i++) {
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+
+        // calculate the distance from all output states to our sliced bits
+        distance_t *distances = conv->distances;
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            // note: this inner `i` shadows the set index of the outer loop
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        const unsigned int *table = conv->table;
+
+        // upper bound for `high` in the state loop below (twice the high bit)
+        unsigned int num_iter = highbit << 1;
+        // stride between the states still visited; it doubles with every
+        // further set because (sets - i) shrinks by one each step
+        unsigned int skip = 1 << (conv->order - (sets - i));
+        // matching stride in the predecessor (right-shifted) state space
+        unsigned int base_skip = skip >> 1;
+
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += skip, high += skip, base += base_skip) {
+            unsigned int low_output = table[low];
+            unsigned int high_output = table[high];
+            distance_t low_dist = distances[low_output];
+            distance_t high_dist = distances[high_output];
+
+            distance_t low_past_error = read_errors[base];
+            distance_t high_past_error = read_errors[highbase + base];
+
+            distance_t low_error = low_dist + low_past_error;
+            distance_t high_error = high_dist + high_past_error;
+
+            shift_register_t successor = low;
+            distance_t error;
+            uint8_t history_mask;
+            // keep the cheaper predecessor; ties go to the high one here
+            if (low_error < high_error) {
+                error = low_error;
+                history_mask = 0;
+            } else {
+                error = high_error;
+                history_mask = 1;
+            }
+            write_errors[successor] = error;
+            history[successor] = history_mask;
+        }
+
+        // commit this slice, searching only the states that were just updated
+        history_buffer_process_skip(conv->history_buffer, write_errors, conv->bit_writer, skip);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+/*
+ * Allocate the state that only decoding needs and mark the codec as
+ * decode-ready.
+ *
+ * conv:                 codec to extend
+ * min_traceback:        forwarded to history_buffer_create as the minimum
+ *                       traceback length
+ * traceback_length:     forwarded to history_buffer_create as the traceback
+ *                       group length
+ * renormalize_interval: number of steps between metric renormalisations
+ *
+ * The soft metric defaults to CORRECT_SOFT_LINEAR.
+ */
+void _convolutional_decode_init(correct_convolutional *conv, unsigned int min_traceback,
+                                unsigned int traceback_length, unsigned int renormalize_interval) {
+    // the history buffer tracks half of the register states
+    const unsigned int half_states = conv->numstates / 2;
+    const shift_register_t highbit = 1 << (conv->order - 1);
+
+    conv->soft_measurement = CORRECT_SOFT_LINEAR;
+
+    // one distance slot per possible rate-bit output pattern
+    conv->distances = calloc(1 << (conv->rate), sizeof(distance_t));
+    conv->pair_lookup = pair_lookup_create(conv->rate, conv->order, conv->table);
+
+    conv->history_buffer = history_buffer_create(min_traceback, traceback_length,
+                                                 renormalize_interval, half_states, highbit);
+
+    conv->errors = error_buffer_create(conv->numstates);
+
+    conv->has_init_decode = true;
+}
+
+/*
+ * Shared decode driver behind the hard- and soft-decision entry points.
+ *
+ * conv:              codec; decoder state is allocated on the first call
+ * num_encoded_bits:  number of encoded bits to decode
+ * num_encoded_bytes: encoded length in whole bytes, also used as the size of
+ *                    the output buffer handed to the bit writer
+ * msg:               output buffer for the decoded message
+ * soft_encoded:      soft samples, or NULL when conv->bit_reader has been
+ *                    pointed at hard bits by the caller
+ *
+ * Returns the number of bytes written to msg.
+ */
+static ssize_t _convolutional_decode(correct_convolutional *conv, size_t num_encoded_bits,
+                                     size_t num_encoded_bytes, uint8_t *msg,
+                                     const soft_t *soft_encoded) {
+    // lazily allocate the decoder state the first time this codec decodes
+    if (!conv->has_init_decode) {
+        // renormalise often enough that an accumulated metric cannot overflow
+        const uint64_t worst_step_error = conv->rate * soft_max;
+        const unsigned int renorm_every = distance_max / worst_step_error;
+        _convolutional_decode_init(conv, 5 * conv->order, 15 * conv->order, renorm_every);
+    }
+
+    const size_t sets = num_encoded_bits / conv->rate;
+
+    // XXX fix this: the output capacity is simply taken to be the encoded size
+    bit_writer_reconfigure(conv->bit_writer, msg, num_encoded_bytes);
+
+    error_buffer_reset(conv->errors);
+    history_buffer_reset(conv->history_buffer);
+
+    // three phases; no outputs are generated during warmup
+    convolutional_decode_warmup(conv, sets, soft_encoded);
+    convolutional_decode_inner(conv, sets, soft_encoded);
+    convolutional_decode_tail(conv, sets, soft_encoded);
+
+    // emit whatever is still pending in the history buffer
+    history_buffer_flush(conv->history_buffer, conv->bit_writer);
+
+    return bit_writer_length(conv->bit_writer);
+}
+
+/*
+ * Viterbi decode with hard decisions (one bit per encoded bit).
+ *
+ * conv:             codec to decode with
+ * encoded:          packed encoded bits
+ * num_encoded_bits: number of valid bits in `encoded`; must be a multiple of
+ *                   the codec's rate
+ * msg:              output buffer for the decoded message
+ *
+ * Returns the number of decoded bytes written to msg, or -1 if
+ * num_encoded_bits is not a multiple of the rate.
+ */
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->rate != 0) {
+        // XXX turn this into an error code
+        // encoded length of message must be a multiple of rate
+        return -1;
+    }
+
+    // round the bit count up to whole bytes
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8 != 0) {
+        num_encoded_bytes++;
+    }
+
+    bit_reader_reconfigure(conv->bit_reader, encoded, num_encoded_bytes);
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+/*
+ * Viterbi decode with soft decisions (one soft_t sample per encoded bit).
+ *
+ * conv:             codec to decode with
+ * encoded:          one soft sample per encoded bit
+ * num_encoded_bits: number of samples in `encoded`; must be a multiple of the
+ *                   codec's rate
+ * msg:              output buffer for the decoded message
+ *
+ * Returns the number of decoded bytes written to msg, or -1 if
+ * num_encoded_bits is not a multiple of the rate.
+ */
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv, const soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->rate != 0) {
+        // XXX turn this into an error code
+        // encoded length of message must be a multiple of rate
+        return -1;
+    }
+
+    // round the bit count up to whole bytes
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8 != 0) {
+        num_encoded_bytes++;
+    }
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
--- a/core/libcorrect/src/convolutional/encode.c
+++ b/core/libcorrect/src/convolutional/encode.c
@@ -0,0 +1,61 @@
+#include "correct/convolutional/convolutional.h"
+
+/*
+ * Number of encoded BITS produced for a message of msg_len bytes: every
+ * message bit plus order + 1 flush steps, each emitting `rate` output bits.
+ */
+size_t correct_convolutional_encode_len(correct_convolutional *conv, size_t msg_len) {
+    const size_t steps = 8 * msg_len + conv->order + 1;
+    return conv->rate * steps;
+}
+
+/*
+ * Convolutionally encode msg into encoded.
+ *
+ * Message bits are fed one at a time through bit_reader_read into a shift
+ * register: the newest bit enters on the right and older bits move left.
+ * After each shift the register value indexes conv->table, whose row holds
+ * the `rate` output bits for that state.
+ *
+ * conv:    codec to encode with
+ * msg:     message bytes
+ * msg_len: number of bytes in msg
+ * encoded: output buffer; it is assumed to be large enough for
+ *          correct_convolutional_encode_len(conv, msg_len) bits, rounded up
+ *          to whole bytes
+ *
+ * Returns the number of encoded bits written.
+ */
+size_t correct_convolutional_encode(correct_convolutional *conv,
+                                    const uint8_t *msg,
+                                    size_t msg_len,
+                                    uint8_t *encoded) {
+    const size_t total_bits = correct_convolutional_encode_len(conv, msg_len);
+
+    // round the output size up to whole bytes
+    size_t total_bytes = total_bits / 8;
+    if (total_bits % 8) {
+        total_bytes++;
+    }
+    bit_writer_reconfigure(conv->bit_writer, encoded, total_bytes);
+    bit_reader_reconfigure(conv->bit_reader, msg, msg_len);
+
+    // keeps only the low `order` bits of the register
+    // e.g. if order is 7, bit 7 and above are cleared
+    const unsigned int shiftmask = (1 << conv->order) - 1;
+
+    // history of recent message bits: oldest on the left, newest on the right
+    shift_register_t state = 0;
+
+    const size_t msg_bits = 8 * msg_len;
+    for (size_t n = 0; n < msg_bits; n++) {
+        state = ((state << 1) | bit_reader_read(conv->bit_reader, 1)) & shiftmask;
+
+        // direct lookup: the table row holds every output bit for this state
+        bit_writer_write(conv->bit_writer, conv->table[state], conv->rate);
+    }
+
+    // flush the register: same as above, but the new input is all 0s
+    for (size_t n = 0; n < conv->order + 1; n++) {
+        state = (state << 1) & shiftmask;
+        bit_writer_write(conv->bit_writer, conv->table[state], conv->rate);
+    }
+
+    // 0-fill any remaining bits on our final byte
+    bit_writer_flush_byte(conv->bit_writer);
+
+    return total_bits;
+}
--- a/core/libcorrect/src/convolutional/error_buffer.c
+++ b/core/libcorrect/src/convolutional/error_buffer.c
@@ -0,0 +1,43 @@
+#include "correct/convolutional/error_buffer.h"
+
+/*
+ * Allocate a double-buffered store of per-state error metrics.
+ *
+ * num_states: number of metrics held in each of the two buffers
+ *
+ * Both buffers start zeroed; buffer 0 is the initial read side and buffer 1
+ * the initial write side. Release with error_buffer_destroy.
+ */
+error_buffer_t *error_buffer_create(unsigned int num_states) {
+    error_buffer_t *buf = calloc(1, sizeof(*buf));
+
+    buf->num_states = num_states;
+
+    // two metric arrays: one holds the previous round, the other receives the
+    // current one. each metric is the aggregated error of the path ending in
+    // that shift register state
+    for (unsigned int which = 0; which < 2; which++) {
+        buf->errors[which] = calloc(num_states, sizeof(distance_t));
+    }
+
+    buf->index = 0;
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+
+    return buf;
+}
+
+/*
+ * Free both metric arrays and then the error buffer itself.
+ */
+void error_buffer_destroy(error_buffer_t *buf) {
+    for (unsigned int which = 0; which < 2; which++) {
+        free(buf->errors[which]);
+    }
+    free(buf);
+}
+
+/*
+ * Zero both metric arrays and restore the initial roles: buffer 0 is read
+ * from, buffer 1 is written to.
+ */
+void error_buffer_reset(error_buffer_t *buf) {
+    const size_t bytes_per_buffer = buf->num_states * sizeof(distance_t);
+    for (unsigned int which = 0; which < 2; which++) {
+        memset(buf->errors[which], 0, bytes_per_buffer);
+    }
+
+    buf->index = 0;
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+}
+
+/*
+ * Exchange the roles of the two metric arrays: the array that was just
+ * written becomes the one to read, and the other becomes the write target.
+ *
+ * The roles are derived from the current write pointer rather than from
+ * buf->index. error_buffer_create and error_buffer_reset leave index at 0
+ * while write_errors points at errors[1]; deriving the roles from index made
+ * the first swap after a reset a no-op, so the metrics of the first decoded
+ * set were overwritten instead of being carried forward.
+ *
+ * On return buf->index is the position of the write array within buf->errors.
+ */
+void error_buffer_swap(error_buffer_t *buf) {
+    distance_t *just_written = buf->write_errors;
+    const unsigned int next_write = (just_written == buf->errors[0]) ? 1 : 0;
+
+    buf->read_errors = just_written;
+    buf->write_errors = buf->errors[next_write];
+    buf->index = next_write;
+}
--- a/core/libcorrect/src/convolutional/history_buffer.c
+++ b/core/libcorrect/src/convolutional/history_buffer.c
@@ -0,0 +1,158 @@
+#include "correct/convolutional/history_buffer.h"
+
+/*
+ * Allocate a ring buffer of per-state history bytes used for traceback.
+ *
+ * min_traceback_length:   stored as-is; passed to history_buffer_traceback as
+ *                         the number of steps walked back before output starts
+ * traceback_group_length: stored as-is; together with min_traceback_length it
+ *                         determines the capacity
+ * renormalize_interval:   number of processed slices between renormalisations
+ * num_states:             number of entries in each history slice
+ * highbit:                bit OR-ed into the path during traceback when a
+ *                         history entry is set
+ *
+ * The capacity is min_traceback_length + traceback_group_length slices.
+ * Release with history_buffer_destroy.
+ */
+history_buffer *history_buffer_create(unsigned int min_traceback_length,
+                                      unsigned int traceback_group_length,
+                                      unsigned int renormalize_interval, unsigned int num_states,
+                                      shift_register_t highbit) {
+    history_buffer *buf = calloc(1, sizeof(history_buffer));
+
+    // these members are written through casts, presumably because they are
+    // declared const in the struct -- they are set once, here
+    *(unsigned int *)&buf->min_traceback_length = min_traceback_length;
+    *(unsigned int *)&buf->traceback_group_length = traceback_group_length;
+    *(unsigned int *)&buf->cap = min_traceback_length + traceback_group_length;
+    *(unsigned int *)&buf->num_states = num_states;
+    *(shift_register_t *)&buf->highbit = highbit;
+
+    // one zeroed slice of num_states bytes per ring position
+    buf->history = malloc(buf->cap * sizeof(uint8_t *));
+    for (unsigned int i = 0; i < buf->cap; i++) {
+        buf->history[i] = calloc(num_states, sizeof(uint8_t));
+    }
+    // scratch space for the bits collected during one traceback
+    buf->fetched = malloc(buf->cap * sizeof(uint8_t));
+
+    buf->index = 0;
+    buf->len = 0;
+
+    buf->renormalize_counter = 0;
+    buf->renormalize_interval = renormalize_interval;
+
+    return buf;
+}
+
+/*
+ * Free every history slice, the slice array, the traceback scratch space and
+ * finally the buffer itself.
+ */
+void history_buffer_destroy(history_buffer *buf) {
+    unsigned int slice = 0;
+    while (slice < buf->cap) {
+        free(buf->history[slice]);
+        slice++;
+    }
+    free(buf->history);
+    free(buf->fetched);
+    free(buf);
+}
+
+/*
+ * Rewind the ring buffer: the write position returns to slice 0 and no
+ * slices are considered stored. The renormalisation counter is left as it is.
+ */
+void history_buffer_reset(history_buffer *buf) {
+    buf->index = 0;
+    buf->len = 0;
+}
+
+/*
+ * Return the history slice at the current write position (buf->index).
+ */
+uint8_t *history_buffer_get_slice(history_buffer *buf) {
+    uint8_t *current_slice = buf->history[buf->index];
+    return current_slice;
+}
+
+/*
+ * Find the state with the smallest error metric.
+ *
+ * buf:          supplies the number of states to scan (buf->num_states)
+ * distances:    error metric per state
+ * search_every: stride between the states that are examined
+ *
+ * Returns the first examined state whose metric is strictly below every
+ * metric examined before it. If no examined metric is below USHRT_MAX (or no
+ * state is examined at all), state 0 is returned instead of an indeterminate
+ * value.
+ */
+shift_register_t history_buffer_search(history_buffer *buf, const distance_t *distances,
+                                       unsigned int search_every) {
+    // start from a defined state so a scan that never improves on USHRT_MAX
+    // still returns a valid index
+    shift_register_t bestpath = 0;
+    distance_t leasterror = USHRT_MAX;
+    // search for a state with the least error
+    for (shift_register_t state = 0; state < buf->num_states; state += search_every) {
+        if (distances[state] < leasterror) {
+            leasterror = distances[state];
+            bestpath = state;
+        }
+    }
+    return bestpath;
+}
+
+/*
+ * Subtract the metric of state min_register from the metric of every state,
+ * so accumulated errors stay small.
+ *
+ * buf:          supplies the number of states (buf->num_states)
+ * distances:    error metric per state, modified in place
+ * min_register: state whose metric is used as the baseline
+ */
+void history_buffer_renormalize(history_buffer *buf, distance_t *distances,
+                                shift_register_t min_register) {
+    // capture the baseline first: its own slot is rewritten by the loop
+    const distance_t baseline = distances[min_register];
+    shift_register_t state = 0;
+    while (state < buf->num_states) {
+        distances[state] -= baseline;
+        state++;
+    }
+}
+
+/*
+ * Walk the stored history backwards from `bestpath` and emit decoded bits.
+ *
+ * The first min_traceback_length steps are walked without producing output.
+ * Every further stored slice (up to buf->len in total) contributes one bit to
+ * buf->fetched. Because the walk runs backwards in time, the collected bits
+ * are handed to bit_writer_write_bitlist_reversed. buf->len is reduced by the
+ * number of bits emitted.
+ *
+ * buf:                  history to walk
+ * bestpath:             state at which the walk starts
+ * min_traceback_length: number of leading steps that produce no output
+ * output:               writer receiving the decoded bits
+ */
+void history_buffer_traceback(history_buffer *buf, shift_register_t bestpath,
+                              unsigned int min_traceback_length, bit_writer_t *output) {
+    unsigned int fetched_index = 0;
+    shift_register_t highbit = buf->highbit;
+    unsigned int index = buf->index;
+    unsigned int cap = buf->cap;
+    // phase 1: follow the path back without recording any bits
+    for (unsigned int j = 0; j < min_traceback_length; j++) {
+        // step to the previous slice, wrapping around the ring
+        if (index == 0) {
+            index = cap - 1;
+        } else {
+            index--;
+        }
+        // we're walking backwards from what the work we did before
+        // so, we'll shift high order bits in
+        // the path will cross multiple different shift register states, and we determine
+        //   which state by going backwards one time slice at a time
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+    }
+    // prefetch_index always runs one slice ahead (further back) of index
+    unsigned int prefetch_index = index;
+    if (prefetch_index == 0) {
+        prefetch_index = cap - 1;
+    } else {
+        prefetch_index--;
+    }
+    unsigned int len = buf->len;
+    // phase 2: keep following the path, now recording one bit per slice
+    for (unsigned int j = min_traceback_length; j < len; j++) {
+        index = prefetch_index;
+        if (prefetch_index == 0) {
+            prefetch_index = cap - 1;
+        } else {
+            prefetch_index--;
+        }
+        // hint the slice needed by the next iteration
+        prefetch(buf->history[prefetch_index]);
+        // we're walking backwards from what the work we did before
+        // so, we'll shift high order bits in
+        // the path will cross multiple different shift register states, and we determine
+        //   which state by going backwards one time slice at a time
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+        buf->fetched[fetched_index] = (pathbit ? 1 : 0);
+        fetched_index++;
+    }
+    // bits were collected newest-first; write them out in reverse
+    bit_writer_write_bitlist_reversed(output, buf->fetched, fetched_index);
+    buf->len -= fetched_index;
+}
+
+/*
+ * Commit the history slice that was just filled in and run whatever
+ * maintenance is due: renormalising the metrics, tracing back to emit
+ * decoded bits, both, or neither.
+ *
+ * buf:       history buffer to advance
+ * distances: error metric per state for the slice just written; modified in
+ *            place when a renormalisation is due
+ * output:    writer receiving decoded bits when a traceback is due
+ * skip:      stride used when searching for the best state
+ */
+void history_buffer_process_skip(history_buffer *buf, distance_t *distances, bit_writer_t *output,
+                                 unsigned int skip) {
+    // advance the ring position, wrapping at capacity
+    buf->index = (buf->index + 1 == buf->cap) ? 0 : buf->index + 1;
+
+    buf->len++;
+    buf->renormalize_counter++;
+
+    // renormalisation is due every renormalize_interval slices; a traceback
+    // is due once the ring is full
+    const int renormalize_due = (buf->renormalize_counter == buf->renormalize_interval);
+    const int traceback_due = (buf->len == buf->cap);
+
+    if (renormalize_due) {
+        buf->renormalize_counter = 0;
+    }
+    if (!renormalize_due && !traceback_due) {
+        return;
+    }
+
+    // finding the best state is expensive, so a single search serves both
+    // the renormalisation and the traceback
+    shift_register_t bestpath = history_buffer_search(buf, distances, skip);
+    if (renormalize_due) {
+        history_buffer_renormalize(buf, distances, bestpath);
+    }
+    if (traceback_due) {
+        history_buffer_traceback(buf, bestpath, buf->min_traceback_length, output);
+    }
+}
+
+/*
+ * Commit the current history slice, examining every state when searching
+ * for the best path (stride of 1).
+ */
+void history_buffer_process(history_buffer *buf, distance_t *distances, bit_writer_t *output) {
+    const unsigned int every_state = 1;
+    history_buffer_process_skip(buf, distances, output, every_state);
+}
+
+/*
+ * Emit every bit still held in the history buffer: trace back from state 0
+ * with no leading steps skipped.
+ */
+void history_buffer_flush(history_buffer *buf, bit_writer_t *output) {
+    const shift_register_t start_state = 0;
+    const unsigned int no_min_traceback = 0;
+    history_buffer_traceback(buf, start_state, no_min_traceback, output);
+}
--- a/core/libcorrect/src/convolutional/lookup.c
+++ b/core/libcorrect/src/convolutional/lookup.c
@@ -0,0 +1,74 @@
+#include "correct/convolutional/lookup.h"
+
+/*
+ * Build the encoder output table.
+ *
+ * The table has 2^order rows, one per shift register state. Each row holds
+ * all polynomial output bits for that state concatenated together, e.g. for
+ * rate 2 each row holds 2 bits. The first polynomial supplies the LEAST
+ * significant bit, the last polynomial the most significant one. An output
+ * bit is the parity of (state & polynomial).
+ *
+ * rate:  number of polynomials in `poly`
+ * order: shift register length in bits
+ * poly:  generator polynomials
+ * table: output array with room for 2^order entries
+ */
+void fill_table(unsigned int rate,
+                unsigned int order,
+                const polynomial_t *poly,
+                unsigned int *table) {
+    // count rows in a type wider than shift_register_t: when order equals the
+    // bit width of shift_register_t, a shift_register_t counter can never
+    // reach 2^order and the loop would not terminate
+    const size_t num_states = (size_t)1 << order;
+    for (size_t row = 0; row < num_states; row++) {
+        const shift_register_t state = (shift_register_t)row;
+        unsigned int out = 0;
+        unsigned int mask = 1;
+        for (size_t j = 0; j < rate; j++) {
+            out |= (popcount(state & poly[j]) % 2) ? mask : 0;
+            mask <<= 1;
+        }
+        table[row] = out;
+    }
+}
+
+/*
+ * Build a lookup that maps each pair of adjacent shift register states
+ * (2i, 2i + 1) to a key identifying the pair's concatenated outputs.
+ *
+ * rate:  number of output bits per table entry
+ * order: shift register length in bits; 2^(order - 1) pairs are processed
+ * table: encoder output table, indexed by state
+ *
+ * Distinct concatenated outputs receive keys 1, 2, ... in order of first
+ * appearance; key 0 is never handed out. The returned structure owns its
+ * keys, outputs and distances arrays; release them with pair_lookup_destroy.
+ */
+pair_lookup_t pair_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    pair_lookup_t pairs;
+
+    pairs.keys = malloc(sizeof(unsigned int) * (1 << (order - 1)));
+    pairs.outputs = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    // temporary reverse map: concatenated output -> key (0 means "no key yet")
+    unsigned int *inv_outputs = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    // keys start at 1 so that 0 can serve as the "unassigned" marker above
+    unsigned int output_counter = 1;
+    // for every (even-numbered) shift register state, find the concatenated output of the state
+    //   and the subsequent state that follows it (low bit set). then, check to see if this
+    //   concatenated output has a unique key assigned to it already. if not, give it a key.
+    //   if it does, retrieve the key. assign this key to the shift register state.
+    for (unsigned int i = 0; i < (1 << (order - 1)); i++) {
+        // first get the concatenated pair of outputs
+        // (odd state's output in the high bits, even state's in the low bits)
+        unsigned int out = table[i * 2 + 1];
+        out <<= rate;
+        out |= table[i * 2];
+
+        // does this concatenated output exist in the outputs table yet?
+        if (!inv_outputs[out]) {
+            // doesn't exist, allocate a new key
+            inv_outputs[out] = output_counter;
+            pairs.outputs[output_counter] = out;
+            output_counter++;
+        }
+        // set the opaque key for the ith shift register state to the concatenated output entry
+        pairs.keys[i] = inv_outputs[out];
+    }
+    // one past the highest key in use; slot 0 is unused
+    pairs.outputs_len = output_counter;
+    pairs.output_mask = (1 << (rate)) - 1;
+    pairs.output_width = rate;
+    pairs.distances = calloc(pairs.outputs_len, sizeof(distance_pair_t));
+    free(inv_outputs);
+    return pairs;
+}
+
+/*
+ * Free the three arrays owned by a pair lookup. The structure itself is
+ * passed by value and is not freed.
+ */
+void pair_lookup_destroy(pair_lookup_t pairs) {
+    free(pairs.distances);
+    free(pairs.outputs);
+    free(pairs.keys);
+}
+
+/*
+ * Refresh the per-key distance pairs from the per-output distances of the
+ * current time slice.
+ *
+ * pairs:     lookup whose distances array is rewritten (keys 1 and up)
+ * distances: distance for every single output pattern
+ *
+ * For each key, the entry packs the distance of the pair's second output
+ * into the upper 16 bits and the distance of its first output into the lower
+ * 16 bits.
+ */
+void pair_lookup_fill_distance(pair_lookup_t pairs, distance_t *distances) {
+    for (unsigned int i = 1; i < pairs.outputs_len; i += 1) {
+        output_pair_t concat_out = pairs.outputs[i];
+        unsigned int i_0 = concat_out & pairs.output_mask;
+        concat_out >>= pairs.output_width;
+        unsigned int i_1 = concat_out;
+
+        // widen before shifting: a narrow distance_t would otherwise be
+        // promoted to signed int, and shifting a value >= 0x8000 left by 16
+        // overflows it
+        pairs.distances[i] = ((distance_pair_t)distances[i_1] << 16) | distances[i_0];
+    }
+}
--- a/core/libcorrect/src/convolutional/metric.c
+++ b/core/libcorrect/src/convolutional/metric.c
@@ -0,0 +1,17 @@
+#include "correct/convolutional/metric.h"
+
+/*
+ * Squared euclidean distance between a hard bit pattern and soft samples,
+ * scaled down by 8.
+ *
+ * Since the euclidean distance is sqrt(a^2 + b^2 + ... + n^2), its square is
+ * simply a^2 + b^2 + ... + n^2.
+ *
+ * hard_x: bit pattern; bit i is compared against soft_y[i]
+ * soft_y: soft samples, 0..255 each
+ * len:    number of samples to compare
+ *
+ * The squares are summed in an unsigned int and only the final, scaled result
+ * is narrowed to distance_t. A single squared difference can reach 65025, so
+ * summing directly in a 16-bit type would wrap from the second sample on.
+ * NOTE(review): distance_t is presumably 16 bits wide (USHRT_MAX is used as
+ * its "worst" value elsewhere in this library) -- confirm.
+ */
+distance_t metric_soft_distance_quadratic(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
+    unsigned int sum_of_squares = 0;
+    for (size_t i = 0; i < len; i++) {
+        // first, convert hard_x to a soft measurement (0 -> 0, 1 -> 255)
+        const int soft_x = (hard_x & 1) ? 255 : 0;
+        hard_x >>= 1;
+        const int d = (int)soft_y[i] - soft_x;
+        sum_of_squares += (unsigned int)(d * d);
+    }
+    return (distance_t)(sum_of_squares >> 3);
+}
+
+
--- a/core/libcorrect/src/convolutional/sse/CMakeLists.txt
+++ b/core/libcorrect/src/convolutional/sse/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Object library holding the SSE variant of the convolutional codec.
+add_library(correct-convolutional-sse OBJECT
+  lookup.c
+  convolutional.c
+  encode.c
+  decode.c
+)
--- a/core/libcorrect/src/convolutional/sse/convolutional.c
+++ b/core/libcorrect/src/convolutional/sse/convolutional.c
@@ -0,0 +1,21 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// Allocate an SSE convolutional codec and initialise its embedded base codec.
+//
+// rate, order and poly are forwarded unchanged to _correct_convolutional_init.
+// Returns NULL when the allocation fails or when that initialiser reports
+// failure; otherwise the caller owns the result and releases it with
+// correct_convolutional_sse_destroy.
+correct_convolutional_sse *correct_convolutional_sse_create(size_t rate,
+                                                            size_t order,
+                                                            const polynomial_t *poly) {
+    correct_convolutional_sse *conv = malloc(sizeof(correct_convolutional_sse));
+    if (!conv) {
+        // out of memory: never hand a NULL-derived pointer to the initialiser
+        return NULL;
+    }
+    correct_convolutional *init_conv = _correct_convolutional_init(&conv->base_conv, rate, order, poly);
+    if (!init_conv) {
+        free(conv);
+        conv = NULL;
+    }
+    return conv;
+}
+
+// Tear down a codec obtained from correct_convolutional_sse_create.
+//
+// Accepts NULL (create returns NULL on failure) and does nothing in that case.
+// The oct lookup table only exists once decoding has been initialised, so it
+// is released only when has_init_decode is set.
+void correct_convolutional_sse_destroy(correct_convolutional_sse *conv) {
+    if (!conv) {
+        return;
+    }
+    if (conv->base_conv.has_init_decode) {
+        oct_lookup_destroy(conv->oct_lookup);
+    }
+    _correct_convolutional_teardown(&conv->base_conv);
+    free(conv);
+}
--- a/core/libcorrect/src/convolutional/sse/decode.c
+++ b/core/libcorrect/src/convolutional/sse/decode.c
@@ -0,0 +1,319 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// SSE add-compare-select loop for the middle section of an encoded stream.
+//
+// Handles time slices order - 1 up to (but excluding) sets - order + 1. For
+// every slice it computes the branch distances, combines them with the
+// accumulated path errors, stores the smaller error and a history byte per
+// register state, and advances the history buffer.
+//
+//   sse_conv: codec providing the oct lookup table, error buffers and
+//             history buffer
+//   sets:     number of rate-sized groups in the encoded input
+//   soft:     soft-decision input, or NULL to read hard-decision bits from
+//             conv->bit_reader
+//
+// The history buffer's len/index/renormalize_counter are shadowed in locals
+// while looping and written back before returning.
+static void convolutional_sse_decode_inner(correct_convolutional_sse *sse_conv, unsigned int sets,
+                                           const uint8_t *soft) {
+    correct_convolutional *conv = &sse_conv->base_conv;
+    // inputs shorter than order - 1 sets have no middle section; without this
+    // check the unsigned loop bound below (sets - order + 1) would wrap around
+    // to a huge value
+    if (sets < conv->order - 1) {
+        return;
+    }
+    shift_register_t highbit = 1 << (conv->order - 1);
+    unsigned int hist_buf_index = conv->history_buffer->index;
+    unsigned int hist_buf_cap = conv->history_buffer->cap;
+    unsigned int hist_buf_len = conv->history_buffer->len;
+    unsigned int hist_buf_rn_int = conv->history_buffer->renormalize_interval;
+    unsigned int hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+    for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
+        distance_t *distances = conv->distances;
+        // lasterrors are the aggregate bit errors for the states of
+        // shiftregister for the previous time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            // uses j like the soft branches so the time slice index i is not shadowed
+            for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                distances[j] = metric_distance(j, out);
+            }
+        }
+        oct_lookup_t oct_lookup = sse_conv->oct_lookup;
+        oct_lookup_fill_distance(oct_lookup, distances);
+
+        // exclusive upper bound for `high` in the state loop below (1 << order)
+        unsigned int num_iter = highbit << 1;
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = conv->history_buffer->history[hist_buf_index];
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit
+        // errors at that path at this time slice
+        // this loop considers two paths per iteration (high order bit set,
+        // clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the
+        // least aggregated bit errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share
+        // a successor
+        //
+        // the first set definition is the two states that are the same except
+        // for the least order bit
+        // these two share a predecessor because their high n - 1 bits are the
+        // same (differ only by newest bit)
+        //
+        // the second set definition is the two states that are the same except
+        // for the high order bit
+        // these two share a successor because the oldest high order bit will be
+        // shifted out, and the other bits will be present in the successor
+        //
+        shift_register_t highbase = highbit >> 1;
+        shift_register_t oct_highbase = highbase >> 2;
+        for (shift_register_t low = 0, high = highbit, base = 0, oct = 0; high < num_iter;
+             low += 32, high += 32, base += 16, oct += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            __m128i past_shuffle_mask =
+                _mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100);
+            __m128i hist_mask =
+                _mm_set_epi32(0x80808080, 0x80808080, 0x0e0c0a09, 0x07050301);
+
+            // each pass of the loop below produces 32 register states
+            // it does this by packing the 128-bit xmm registers with 8, 16-bit
+            // distances
+            // 4 of these registers hold distances for convolutional shift
+            // register states with the high bit cleared
+            //      and 4 hold distances for the corresponding shift register
+            //      states with the high bit set
+            // the element-wise minimum of each low/high register pair yields
+            // 4 * 8 = 32 results per pass
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 16;
+                 offset += 32, base_offset += 16) {
+                // load the past error for the register states with the high
+                // order bit cleared
+                __m128i low_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset));
+                __m128i low_past_error0 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 4));
+                __m128i low_past_error1 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 8));
+                __m128i low_past_error2 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 12));
+
+                // shuffle the low past error
+                // register states that differ only by their low order bit share
+                // a past error
+                low_past_error = _mm_shuffle_epi8(low_past_error, past_shuffle_mask);
+                low_past_error0 = _mm_shuffle_epi8(low_past_error0, past_shuffle_mask);
+                low_past_error1 = _mm_shuffle_epi8(low_past_error1, past_shuffle_mask);
+                low_past_error2 = _mm_shuffle_epi8(low_past_error2, past_shuffle_mask);
+
+                // repeat past error lookup for register states with high order
+                // bit set
+                __m128i high_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + highbase + base + base_offset));
+                __m128i high_past_error0 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 4));
+                __m128i high_past_error1 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 8));
+                __m128i high_past_error2 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 12));
+
+                high_past_error = _mm_shuffle_epi8(high_past_error, past_shuffle_mask);
+                high_past_error0 = _mm_shuffle_epi8(high_past_error0, past_shuffle_mask);
+                high_past_error1 = _mm_shuffle_epi8(high_past_error1, past_shuffle_mask);
+                high_past_error2 = _mm_shuffle_epi8(high_past_error2, past_shuffle_mask);
+
+                // load the opaque oct distance table keys from our loop index
+                distance_oct_key_t low_key = oct_lookup.keys[oct + (base_offset / 4)];
+                distance_oct_key_t low_key0 = oct_lookup.keys[oct + (base_offset / 4) + 1];
+                distance_oct_key_t low_key1 = oct_lookup.keys[oct + (base_offset / 4) + 2];
+                distance_oct_key_t low_key2 = oct_lookup.keys[oct + (base_offset / 4) + 3];
+
+                // load the distances for the register states with high order
+                // bit cleared
+                __m128i low_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key));
+                __m128i low_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key0));
+                __m128i low_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key1));
+                __m128i low_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key2));
+
+                // add the distance for this time slice to the past distances
+                __m128i low_error = _mm_add_epi16(low_past_error, low_this_error);
+                __m128i low_error0 = _mm_add_epi16(low_past_error0, low_this_error0);
+                __m128i low_error1 = _mm_add_epi16(low_past_error1, low_this_error1);
+                __m128i low_error2 = _mm_add_epi16(low_past_error2, low_this_error2);
+
+                // repeat oct distance table lookup for registers with high
+                // order bit set
+                distance_oct_key_t high_key =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4)];
+                distance_oct_key_t high_key0 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 1];
+                distance_oct_key_t high_key1 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 2];
+                distance_oct_key_t high_key2 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 3];
+
+                __m128i high_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key));
+                __m128i high_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key0));
+                __m128i high_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key1));
+                __m128i high_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key2));
+
+                __m128i high_error = _mm_add_epi16(high_past_error, high_this_error);
+                __m128i high_error0 = _mm_add_epi16(high_past_error0, high_this_error0);
+                __m128i high_error1 = _mm_add_epi16(high_past_error1, high_this_error1);
+                __m128i high_error2 = _mm_add_epi16(high_past_error2, high_this_error2);
+
+                // distances for this time slice calculated
+
+                // find the least error between registers who differ only in
+                // their high order bit
+                __m128i min_error = _mm_min_epu16(low_error, high_error);
+                __m128i min_error0 = _mm_min_epu16(low_error0, high_error0);
+                __m128i min_error1 = _mm_min_epu16(low_error1, high_error1);
+                __m128i min_error2 = _mm_min_epu16(low_error2, high_error2);
+
+                // write the least error so that the next time slice sees it as
+                // the past error
+                _mm_store_si128((__m128i *)(write_errors + low + offset), min_error);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 8), min_error0);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 16), min_error1);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 24), min_error2);
+
+                // generate history bits as (low_error > least_error)
+                // this operation fills each element with all 1s if true and 0s
+                // if false
+                // in other words, we set the history bit to 1 if
+                //      the register state with high order bit set was the least
+                //      error
+                __m128i hist = _mm_cmpgt_epi16(low_error, min_error);
+                // pack the bits down from 16-bit wide to 8-bit wide to
+                // accommodate history table
+                hist = _mm_shuffle_epi8(hist, hist_mask);
+
+                __m128i hist0 = _mm_cmpgt_epi16(low_error0, min_error0);
+                hist0 = _mm_shuffle_epi8(hist0, hist_mask);
+
+                __m128i hist1 = _mm_cmpgt_epi16(low_error1, min_error1);
+                hist1 = _mm_shuffle_epi8(hist1, hist_mask);
+
+                __m128i hist2 = _mm_cmpgt_epi16(low_error2, min_error2);
+                hist2 = _mm_shuffle_epi8(hist2, hist_mask);
+
+                // store the history bits set by cmp and shuffle operations
+                _mm_storel_epi64((__m128i *)(history + low + offset), hist);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 8), hist0);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 16), hist1);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 24), hist2);
+            }
+        }
+
+        // bypass the call to history buffer
+        // we should really make that function inline and remove this below
+        if (hist_buf_len == hist_buf_cap - 1 || hist_buf_rn_cnt == hist_buf_rn_int - 1) {
+            // restore hist buffer state and invoke it
+            conv->history_buffer->len = hist_buf_len;
+            conv->history_buffer->index = hist_buf_index;
+            conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+            history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+            // restore our local values
+            hist_buf_len = conv->history_buffer->len;
+            hist_buf_index = conv->history_buffer->index;
+            hist_buf_cap = conv->history_buffer->cap;
+            hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+        } else {
+            // cheap path: advance the shadowed counters ourselves, wrapping
+            // the slot index at the buffer capacity
+            hist_buf_len++;
+            hist_buf_index++;
+            if (hist_buf_index == hist_buf_cap) {
+                hist_buf_index = 0;
+            }
+            hist_buf_rn_cnt++;
+        }
+        error_buffer_swap(conv->errors);
+    }
+    // publish the shadowed counters back to the history buffer
+    conv->history_buffer->len = hist_buf_len;
+    conv->history_buffer->index = hist_buf_index;
+    conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+}
+
+// Decoder setup for the SSE codec: run the shared decoder initialisation on
+// the embedded base codec, then build the oct lookup table from the base
+// codec's rate, order and output table.
+static void _convolutional_sse_decode_init(correct_convolutional_sse *conv,
+                                           unsigned int min_traceback,
+                                           unsigned int traceback_length,
+                                           unsigned int renormalize_interval) {
+    correct_convolutional *base = &conv->base_conv;
+
+    _convolutional_decode_init(base, min_traceback, traceback_length, renormalize_interval);
+    conv->oct_lookup = oct_lookup_create(base->rate, base->order, base->table);
+}
+
+// Decode driver shared by the hard- and soft-decision entry points.
+//
+//   num_encoded_bits:  length of the encoded input in bits
+//   num_encoded_bytes: the same length in whole bytes; also used as the size
+//                      of the output buffer `msg`
+//   soft_encoded:      soft-decision input, or NULL for hard-decision input
+//
+// Returns whatever bit_writer_length reports once the history buffer has
+// been flushed.
+static ssize_t _convolutional_sse_decode(correct_convolutional_sse *sse_conv,
+                                         size_t num_encoded_bits, size_t num_encoded_bytes,
+                                         uint8_t *msg, const soft_t *soft_encoded) {
+    correct_convolutional *base = &sse_conv->base_conv;
+
+    // decoder state is set up on first use, guarded by has_init_decode
+    if (!base->has_init_decode) {
+        uint64_t worst_step_error = base->rate * soft_max;
+        // sse implementation unfortunately uses signed math on our unsigned values
+        // reduces usable distance by /2
+        unsigned int renorm_every = (distance_max / 2) / worst_step_error;
+        _convolutional_sse_decode_init(sse_conv, 5 * base->order, 100 * base->order,
+                                       renorm_every);
+    }
+
+    // XXX fix this: the output is sized like the encoded input
+    bit_writer_reconfigure(base->bit_writer, msg, num_encoded_bytes);
+
+    error_buffer_reset(base->errors);
+    history_buffer_reset(base->history_buffer);
+
+    // number of rate-sized groups in the encoded input
+    size_t n_sets = num_encoded_bits / base->rate;
+
+    // no outputs are generated during warmup
+    convolutional_decode_warmup(base, n_sets, soft_encoded);
+    convolutional_sse_decode_inner(sse_conv, n_sets, soft_encoded);
+    convolutional_decode_tail(base, n_sets, soft_encoded);
+
+    history_buffer_flush(base->history_buffer, base->bit_writer);
+
+    return bit_writer_length(base->bit_writer);
+}
+
+// Hard-decision decode: `encoded` is read through the codec's bit reader.
+// Returns -1 when num_encoded_bits is not a multiple of the code rate,
+// otherwise the result of the shared decode driver.
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->base_conv.rate != 0) {
+        // XXX turn this into an error code
+        // (encoded length of message must be a multiple of rate)
+        return -1;
+    }
+
+    // whole bytes, plus one more when bits are left over
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8 != 0) {
+        num_encoded_bytes += 1;
+    }
+    bit_reader_reconfigure(conv->base_conv.bit_reader, encoded, num_encoded_bytes);
+
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+// Soft-decision decode: `encoded` is handed to the shared decode driver as
+// the soft input. Returns -1 when num_encoded_bits is not a multiple of the
+// code rate, otherwise the result of the driver.
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv, const soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->base_conv.rate != 0) {
+        // XXX turn this into an error code
+        // (encoded length of message must be a multiple of rate)
+        return -1;
+    }
+
+    // whole bytes, plus one more when bits are left over
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8 != 0) {
+        num_encoded_bytes += 1;
+    }
+
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
--- a/core/libcorrect/src/convolutional/sse/encode.c
+++ b/core/libcorrect/src/convolutional/sse/encode.c
@@ -0,0 +1,9 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// Forwards to correct_convolutional_encode_len on the embedded base codec.
+size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len) {
+    correct_convolutional *base = &conv->base_conv;
+    return correct_convolutional_encode_len(base, msg_len);
+}
+
+// Forwards to correct_convolutional_encode on the embedded base codec; the
+// SSE codec adds nothing of its own on the encode side.
+size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg, size_t msg_len, uint8_t *encoded) {
+    correct_convolutional *base = &conv->base_conv;
+    return correct_convolutional_encode(base, msg, msg_len, encoded);
+}
--- a/core/libcorrect/src/convolutional/sse/lookup.c
+++ b/core/libcorrect/src/convolutional/sse/lookup.c
@@ -0,0 +1,183 @@
+#include "correct/convolutional/sse/lookup.h"
+
+// Build a lookup table that assigns one key to every distinct group of four
+// consecutive entries of `table`.
+//
+// The four entries of group q (table[4q] .. table[4q + 3]) are concatenated
+// `rate` bits apart, entry 4q in the lowest bits. The first time a
+// concatenation is seen it receives the next free key (keys start at 1; 0
+// means "not seen yet") and is recorded in quads.outputs[key]; quads.keys[q]
+// stores the key of group q. quads.distances is allocated zeroed, one entry
+// per key slot.
+quad_lookup_t quad_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    quad_lookup_t quads;
+    const unsigned int num_quads = 1 << (order - 2);
+    const unsigned int num_concat = 1 << (rate * 4);
+
+    quads.keys = malloc(sizeof(unsigned int) * num_quads);
+    quads.outputs = calloc(num_concat, sizeof(unsigned int));
+    // reverse map: concatenated output -> key, 0 while unassigned
+    unsigned int *key_of_output = calloc(num_concat, sizeof(unsigned int));
+    unsigned int next_key = 1;
+
+    for (unsigned int q = 0; q < num_quads; q++) {
+        // concatenate the quad, highest table entry first
+        const unsigned int *group = table + q * 4;
+        unsigned int out = 0;
+        for (int k = 3; k >= 0; k--) {
+            out = (out << rate) | group[k];
+        }
+
+        if (key_of_output[out] == 0) {
+            // first occurrence of this concatenation: hand out a fresh key
+            key_of_output[out] = next_key;
+            quads.outputs[next_key] = out;
+            next_key++;
+        }
+        quads.keys[q] = key_of_output[out];
+    }
+
+    quads.outputs_len = next_key;
+    quads.output_mask = (1 << (rate)) - 1;
+    quads.output_width = rate;
+    quads.distances = calloc(quads.outputs_len, sizeof(distance_quad_t));
+    free(key_of_output);
+    return quads;
+}
+
+// Release the keys, outputs and distances arrays allocated by
+// quad_lookup_create. The table is passed by value, so the caller's copy
+// still holds the freed pointers afterwards.
+void quad_lookup_destroy(quad_lookup_t quads) {
+    // separate allocations; release order is irrelevant
+    free(quads.distances);
+    free(quads.outputs);
+    free(quads.keys);
+}
+
+// Refresh the packed distance entry of every allocated output quad.
+//
+// quads.outputs[i] holds four encoder outputs concatenated output_width bits
+// apart. Each one is used as an index into `distances`, and the four
+// looked-up values are packed into one 64-bit entry, 16 bits per value, the
+// lowest output's distance in bits 0-15.
+// Entry 0 is never written (the loop starts at 1).
+void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances) {
+    for (unsigned int i = 1; i < quads.outputs_len; i += 1) {
+        output_quad_t concat_out = quads.outputs[i];
+        unsigned int i_0 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_1 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_2 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_3 = concat_out;
+
+        // every lane is widened to 64 bits before shifting: left as an int,
+        // a second-lane distance >= 0x8000 would set the sign bit and then
+        // sign-extend into (and overwrite) the two upper lanes when OR-ed
+        quads.distances[i] = ((uint64_t)distances[i_3] << 48) | ((uint64_t)distances[i_2] << 32) |
+                             ((uint64_t)distances[i_1] << 16) | (uint64_t)distances[i_0];
+    }
+}
+
+// Linear search for `out` in outputs[1 .. num_keys - 1].
+// Returns the index of the first match, or 0 when there is none; slot 0 is
+// never examined, so 0 doubles as the "not found" result.
+distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys) {
+    size_t slot = 1;
+    while (slot < num_keys && outputs[slot] != out) {
+        slot++;
+    }
+    return (slot < num_keys) ? slot : 0;
+}
+
+// Concatenate eight consecutive table entries into one word, entry 7 in the
+// most significant position and entry 0 in the least, with neighbouring
+// entries `stride` bits apart.
+static output_oct_t oct_concat_outputs(const unsigned int *oct_entries, unsigned int stride) {
+    output_oct_t packed = 0;
+    for (int k = 7; k >= 0; k--) {
+        packed <<= stride;
+        packed |= oct_entries[k];
+    }
+    return packed;
+}
+
+// Build a lookup table that assigns one key to every distinct group of eight
+// consecutive entries of `table`.
+//
+// Groups are compared through their compact form (entries `rate` bits
+// apart). The first time a compact form is seen it receives the next free
+// key (keys start at 1) and octs.outputs[key] stores the group in expanded
+// form, one entry per byte. octs.keys[i] holds twice the key of group i,
+// because the distance table is strided by 2. The key arrays start with
+// 2 << rate slots and double whenever they fill up. octs.distances is
+// allocated but not initialised here.
+oct_lookup_t oct_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    oct_lookup_t octs;
+
+    octs.keys = malloc((1 << (order - 3)) * sizeof(distance_oct_key_t));
+    octs.outputs = malloc(((output_oct_t)2 << rate) * sizeof(uint64_t));
+    // compact forms seen so far, indexed by key; only used while building
+    output_oct_t *compact_outs = calloc(((output_oct_t)2 << rate), sizeof(output_oct_t));
+    size_t capacity = 2 << rate;
+    unsigned int next_key = 1;
+
+    for (shift_register_t i = 0; i < (1 << (order - 3)); i++) {
+        const unsigned int *oct_entries = table + i * 8;
+        output_oct_t out = oct_concat_outputs(oct_entries, rate);
+
+        distance_oct_key_t key = oct_lookup_find_key(compact_outs, out, next_key);
+        if (key == 0) {
+            // unseen group: make room if needed, then record it under a new key
+            if (next_key == capacity) {
+                capacity *= 2;
+                octs.outputs = realloc(octs.outputs, capacity * sizeof(output_oct_t));
+                compact_outs = realloc(compact_outs, capacity * sizeof(output_oct_t));
+            }
+            compact_outs[next_key] = out;
+            octs.outputs[next_key] = oct_concat_outputs(oct_entries, 8);
+            key = next_key;
+            next_key++;
+        }
+        // we multiply the key by 2 since the distances are strided by 2
+        octs.keys[i] = key * 2;
+    }
+
+    free(compact_outs);
+    octs.outputs_len = next_key;
+    octs.output_mask = (1 << (rate)) - 1;
+    octs.output_width = rate;
+    octs.distances = malloc(octs.outputs_len * 2 * sizeof(uint64_t));
+    return octs;
+}
+
+// Release the keys, outputs and distances arrays allocated by
+// oct_lookup_create. The table is passed by value, so the caller's copy
+// still holds the freed pointers afterwards.
+void oct_lookup_destroy(oct_lookup_t octs) {
+    // separate allocations; release order is irrelevant
+    free(octs.distances);
+    free(octs.outputs);
+    free(octs.keys);
+}
+
+// WIP: sse approach to filling the distance table
+/*
+void oct_lookup_fill_distance_sse(oct_lookup_t octs, distance_t *distances) {
+    distance_pair_t *distance_pair = (distance_pair_t*)octs.distances;
+    __v4si index_shuffle_mask = (__v4si){0xffffff00, 0xffffff01, 0xffffff02, 0xffffff03};
+    __m256i dist_shuffle_mask = (__m256i){0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff,
+                                          0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff};
+    const int dist_permute_mask = 0x0c;
+    for (unsigned int i = 1; i < octs.outputs_len; i += 2) {
+        // big heaping todo vvv
+        // a) we want 16 bit distances GATHERed, not 32 bit
+        // b) we need to load 8 of those distances, not 4
+        __v4si short_concat_index = _mm_loadl_epi64(octs.outputs + 2*i);
+        __v4si short_concat_index0 = _mm_loadl_epi64(octs.outputs + 2*i + 1);
+        __m256i concat_index = _mm256_cvtepu8_epi32(short_concat_index);
+        __m256i concat_index0 = _mm256_cvtepu8_epi32(short_concat_index0);
+        __m256i dist = _mm256_i32gather_epi32(distances, concat_index, sizeof(distance_t));
+        __m256i dist0 = _mm256_i32gather_epi32(distances, concat_index0, sizeof(distance_t));
+        dist = _mm256_shuffle_epi8(dist, dist_shuffle_mask);
+        dist0 = _mm256_shuffle_epi8(dist0, dist_shuffle_mask);
+        dist = __builtin_shufflevector(dist, dist, 0, 5, 0, 0);
+        dist0 = __builtin_shufflevector(dist0, dist0, 0, 5, 0, 0);
+        __v4si packed_dist = _mm256_castsi256_si128(dist);
+        _mm_store_si128(distance_pair + 8 * i, packed_dist);
+        __v4si packed_dist0 = _mm256_castsi256_si128(dist0);
+        _mm_store_si128(distance_pair + 8 * i + 4, packed_dist0);
+    }
+}
+*/
--- a/core/libcorrect/src/fec_shim.c
+++ b/core/libcorrect/src/fec_shim.c
@@ -0,0 +1,255 @@
+#include <stdlib.h>
+#include <string.h>
+
+#include "fec_shim.h"
+
+typedef struct {
+    correct_reed_solomon *rs; // codec handle returned by correct_reed_solomon_create
+    unsigned int msg_length; // block_length - num_roots
+    unsigned int block_length; // 255 - pad
+    unsigned int num_roots; // number_roots as given to init_rs_char
+    uint8_t *msg_out; // scratch buffer of block_length bytes receiving the encoder output
+    unsigned int pad; // pad as given to init_rs_char
+    uint8_t *erasures; // scratch buffer of num_roots bytes for pad-adjusted erasure positions
+} reed_solomon_shim;
+
+// Create a Reed-Solomon shim handle.
+//
+// Only 8-bit symbols are supported. The block length is 255 - pad and the
+// message length is the block length minus number_roots.
+// primitive_polynomial, first_consecutive_root, root_gap and number_roots are
+// forwarded to correct_reed_solomon_create.
+//
+// Returns NULL when symbol_size is not 8, when number_roots / pad do not fit
+// into a 255-byte block, or when any allocation or the codec creation fails.
+// Release the handle with free_rs_char.
+void *init_rs_char(int symbol_size, int primitive_polynomial,
+                   int first_consecutive_root, int root_gap, int number_roots,
+                   unsigned int pad) {
+    if (symbol_size != 8) {
+        return NULL;
+    }
+    // reject geometries for which the unsigned length arithmetic below
+    // (255 - pad, block_length - number_roots) would wrap around
+    if (number_roots < 0 || pad > 255 || (unsigned int)number_roots > 255 - pad) {
+        return NULL;
+    }
+
+    reed_solomon_shim *shim = malloc(sizeof(reed_solomon_shim));
+    if (!shim) {
+        return NULL;
+    }
+
+    shim->pad = pad;
+    shim->block_length = 255 - pad;
+    shim->num_roots = number_roots;
+    shim->msg_length = shim->block_length - number_roots;
+    shim->rs = correct_reed_solomon_create(primitive_polynomial,
+                                           first_consecutive_root, root_gap, number_roots);
+    // request at least one byte so that a NULL result always means failure
+    shim->msg_out = malloc(shim->block_length ? shim->block_length : 1);
+    shim->erasures = malloc(number_roots ? (size_t)number_roots : 1);
+
+    if (!shim->rs || !shim->msg_out || !shim->erasures) {
+        // partial construction: release whatever was obtained
+        if (shim->rs) {
+            correct_reed_solomon_destroy(shim->rs);
+        }
+        free(shim->msg_out);
+        free(shim->erasures);
+        free(shim);
+        return NULL;
+    }
+
+    return shim;
+}
+
+// Release a handle obtained from init_rs_char, including its codec and both
+// scratch buffers. Accepts NULL (init_rs_char returns NULL on failure) and
+// does nothing in that case.
+void free_rs_char(void *rs) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    if (!shim) {
+        return;
+    }
+    correct_reed_solomon_destroy(shim->rs);
+    free(shim->msg_out);
+    free(shim->erasures);
+    free(shim);
+}
+
+// Encode msg_length bytes of `msg` into the shim's scratch buffer, then copy
+// the num_roots bytes that follow the first msg_length bytes of that buffer
+// into `parity`.
+void encode_rs_char(void *rs, const unsigned char *msg, unsigned char *parity) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    uint8_t *codeword = shim->msg_out;
+
+    correct_reed_solomon_encode(shim->rs, msg, shim->msg_length, codeword);
+    memcpy(parity, &codeword[shim->msg_length], shim->num_roots);
+}
+
+// Decode `block` (block_length bytes) in place, treating the given positions
+// as erasures.
+//
+// Each erasure location is truncated to 8 bits and reduced by the shim's pad
+// before being handed to the codec. The call does nothing when num_erasures
+// is negative or larger than num_roots, because the erasure scratch buffer
+// only has room for num_roots entries. The codec's return value is not
+// reported to the caller.
+void decode_rs_char(void *rs, unsigned char *block, int *erasure_locations,
+                    int num_erasures) {
+    reed_solomon_shim *shim = (reed_solomon_shim *)rs;
+    // refuse counts that cannot be staged without writing past the buffer
+    if (num_erasures < 0 || (unsigned int)num_erasures > shim->num_roots) {
+        return;
+    }
+    for (int i = 0; i < num_erasures; i++) {
+        shim->erasures[i] = (uint8_t)(erasure_locations[i]) - shim->pad;
+    }
+    correct_reed_solomon_decode_with_erasures(shim->rs, block, shim->block_length,
+                                              shim->erasures, num_erasures,
+                                              block);
+}
+
+typedef struct {
+    correct_convolutional *conv; // codec handle returned by correct_convolutional_create
+    unsigned int rate; // rate as given to create_viterbi
+    unsigned int order; // order as given to create_viterbi
+    uint8_t *buf; // decoded-output buffer of buf_len bytes
+    size_t buf_len; // num_decoded_bits rounded up to whole bytes
+    uint8_t *read_iter; // next byte of buf that chainback_viterbi copies out
+    uint8_t *write_iter; // position in buf where update_viterbi_blk decodes to next
+} convolutional_shim;
+
+static correct_convolutional_polynomial_t r12k7[] = {V27POLYA, V27POLYB}; // passed by create_viterbi27 (rate 2, order 7)
+
+static correct_convolutional_polynomial_t r12k9[] = {V29POLYA, V29POLYB}; // passed by create_viterbi29 (rate 2, order 9)
+
+static correct_convolutional_polynomial_t r13k9[] = {V39POLYA, V39POLYB,
+                                                     V39POLYC}; // passed by create_viterbi39 (rate 3, order 9)
+
+static correct_convolutional_polynomial_t r16k15[] = {
+    V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF}; // passed by create_viterbi615 (rate 6, order 15)
+
+/* Common methods */
+// Create a decoder shim: a convolutional codec for (rate, order, poly) plus
+// an output buffer large enough for num_decoded_bits (rounded up to whole
+// bytes). Both buffer cursors start at the beginning of the buffer.
+//
+// Returns NULL when an allocation or the codec creation fails; otherwise the
+// handle is released with delete_viterbi.
+static void *create_viterbi(unsigned int num_decoded_bits, unsigned int rate,
+                            unsigned int order,
+                            correct_convolutional_polynomial_t *poly) {
+    convolutional_shim *shim = malloc(sizeof(convolutional_shim));
+    if (!shim) {
+        return NULL;
+    }
+
+    size_t num_decoded_bytes = (num_decoded_bits % 8)
+                                   ? (num_decoded_bits / 8 + 1)
+                                   : num_decoded_bits / 8;
+
+    shim->rate = rate;
+    shim->order = order;
+    // request at least one byte so that a NULL result always means failure
+    shim->buf = malloc(num_decoded_bytes ? num_decoded_bytes : 1);
+    shim->buf_len = num_decoded_bytes;
+    shim->conv = correct_convolutional_create(rate, order, poly);
+    shim->read_iter = shim->buf;
+    shim->write_iter = shim->buf;
+
+    if (!shim->buf || !shim->conv) {
+        // partial construction: release whatever was obtained
+        if (shim->conv) {
+            correct_convolutional_destroy(shim->conv);
+        }
+        free(shim->buf);
+        free(shim);
+        return NULL;
+    }
+
+    return shim;
+}
+
+// Release a decoder shim together with its output buffer and codec.
+// Accepts NULL and does nothing in that case.
+static void delete_viterbi(void *vit) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+    if (!shim) {
+        return;
+    }
+    free(shim->buf);
+    correct_convolutional_destroy(shim->conv);
+    free(shim);
+}
+
+// Rewind both buffer cursors to the start of the shim's output buffer.
+static void init_viterbi(void *vit) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+    uint8_t *start = shim->buf;
+
+    shim->write_iter = start;
+    shim->read_iter = start;
+}
+
+// Soft-decode num_encoded_groups groups (rate soft values each) from
+// encoded_soft into the shim's output buffer at the write cursor.
+//
+// The number of output bits is taken to be num_encoded_groups - (order - 1).
+// When that exceeds the room left in the buffer, both the group count and
+// the bit count are reduced by the excess. The write cursor then advances by
+// the number of whole bytes written. Inputs with fewer than order - 1 groups
+// are ignored.
+static void update_viterbi_blk(void *vit, const unsigned char *encoded_soft,
+                               unsigned int num_encoded_groups) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+
+    // too short to yield any output; without this check the unsigned
+    // subtraction below wraps to a huge bit count and the clamp that follows
+    // would raise num_encoded_groups beyond what the caller supplied
+    if (num_encoded_groups < shim->order - 1) {
+        return;
+    }
+
+    // don't overwrite our buffer
+    size_t rem = (shim->buf + shim->buf_len) - shim->write_iter;
+    size_t rem_bits = 8 * rem;
+    // this math isn't very clear
+    // here we sort of do the opposite of what liquid-dsp does
+    size_t n_write_bits = num_encoded_groups - (shim->order - 1);
+    if (n_write_bits > rem_bits) {
+        size_t reduction = n_write_bits - rem_bits;
+        num_encoded_groups -= reduction;
+        n_write_bits -= reduction;
+    }
+
+    // what if n_write_bits isn't a multiple of 8?
+    // libcorrect can't start and stop at arbitrary indices...
+    correct_convolutional_decode_soft(
+        shim->conv, encoded_soft, num_encoded_groups * shim->rate, shim->write_iter);
+    shim->write_iter += n_write_bits / 8;
+}
+
+// Copy decoded data from the shim's buffer into `decoded`, starting at the
+// read cursor. At most the bytes between the read and write cursors are
+// copied; the bit count is rounded up to whole bytes and the read cursor
+// advances by the number of bytes copied.
+static void chainback_viterbi(void *vit, unsigned char *decoded,
+                              unsigned int num_decoded_bits) {
+    convolutional_shim *shim = (convolutional_shim *)vit;
+
+    // num_decoded_bits not a multiple of 8?
+    // this is a similar problem to update_viterbi_blk
+    // although here we could actually resolve a non-multiple of 8
+    size_t avail_bits = 8 * (size_t)(shim->write_iter - shim->read_iter);
+
+    // never hand out more than has been decoded so far
+    size_t want_bits = num_decoded_bits;
+    if (want_bits > avail_bits) {
+        want_bits = avail_bits;
+    }
+
+    // whole bytes, plus one more when bits are left over
+    size_t copy_bytes = want_bits / 8;
+    if (want_bits % 8 != 0) {
+        copy_bytes += 1;
+    }
+    memcpy(decoded, shim->read_iter, copy_bytes);
+
+    shim->read_iter += copy_bytes;
+}
+
+/* Rate 1/2, k = 7 */
+// Create a decoder shim for rate 2, order 7 using the r12k7 polynomials.
+// Returns NULL for a negative bit count, which would otherwise be converted
+// to a huge unsigned buffer size.
+void *create_viterbi27(int num_decoded_bits) {
+    if (num_decoded_bits < 0) {
+        return NULL;
+    }
+    return create_viterbi((unsigned int)num_decoded_bits, 2, 7, r12k7);
+}
+
+// Release a handle returned by create_viterbi27.
+void delete_viterbi27(void *vit) {
+    delete_viterbi(vit);
+}
+
+// Rewind the shim's buffer cursors; always returns 0.
+int init_viterbi27(void *vit, int _) {
+    (void)_; // the second argument is unused
+    init_viterbi(vit);
+    return 0;
+}
+
+// Decode num_encoded_groups groups of soft values into the shim's buffer.
+// Returns 0 on success and -1 for a negative group count, which would
+// otherwise be converted to a huge unsigned value.
+int update_viterbi27_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    if (num_encoded_groups < 0) {
+        return -1;
+    }
+    update_viterbi_blk(vit, encoded_soft, (unsigned int)num_encoded_groups);
+    return 0;
+}
+
+// Copy up to num_decoded_bits of decoded output into `decoded`; always
+// returns 0.
+int chainback_viterbi27(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    (void)_; // the last argument is unused
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/2, k = 9 */
+// Create a decoder shim for rate 2, order 9 using the r12k9 polynomials.
+// Returns NULL for a negative bit count, which would otherwise be converted
+// to a huge unsigned buffer size.
+void *create_viterbi29(int num_decoded_bits) {
+    if (num_decoded_bits < 0) {
+        return NULL;
+    }
+    return create_viterbi((unsigned int)num_decoded_bits, 2, 9, r12k9);
+}
+
+// Release a handle returned by create_viterbi29.
+void delete_viterbi29(void *vit) {
+    delete_viterbi(vit);
+}
+
+// Rewind the shim's buffer cursors; always returns 0.
+int init_viterbi29(void *vit, int _) {
+    (void)_; // the second argument is unused
+    init_viterbi(vit);
+    return 0;
+}
+
+// Decode num_encoded_groups groups of soft values into the shim's buffer.
+// Returns 0 on success and -1 for a negative group count, which would
+// otherwise be converted to a huge unsigned value.
+int update_viterbi29_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    if (num_encoded_groups < 0) {
+        return -1;
+    }
+    update_viterbi_blk(vit, encoded_soft, (unsigned int)num_encoded_groups);
+    return 0;
+}
+
+// Copy up to num_decoded_bits of decoded output into `decoded`; always
+// returns 0.
+int chainback_viterbi29(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    (void)_; // the last argument is unused
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/3, k = 9 */
+// Create a decoder shim for rate 3, order 9 using the r13k9 polynomials.
+// Returns NULL for a negative bit count, which would otherwise be converted
+// to a huge unsigned buffer size.
+void *create_viterbi39(int num_decoded_bits) {
+    if (num_decoded_bits < 0) {
+        return NULL;
+    }
+    return create_viterbi((unsigned int)num_decoded_bits, 3, 9, r13k9);
+}
+
+// Release a handle returned by create_viterbi39.
+void delete_viterbi39(void *vit) {
+    delete_viterbi(vit);
+}
+
+// Rewind the shim's buffer cursors; always returns 0.
+int init_viterbi39(void *vit, int _) {
+    (void)_; // the second argument is unused
+    init_viterbi(vit);
+    return 0;
+}
+
+// Decode num_encoded_groups groups of soft values into the shim's buffer.
+// Returns 0 on success and -1 for a negative group count, which would
+// otherwise be converted to a huge unsigned value.
+int update_viterbi39_blk(void *vit, unsigned char *encoded_soft,
+                         int num_encoded_groups) {
+    if (num_encoded_groups < 0) {
+        return -1;
+    }
+    update_viterbi_blk(vit, encoded_soft, (unsigned int)num_encoded_groups);
+    return 0;
+}
+
+// Copy up to num_decoded_bits of decoded output into `decoded`; always
+// returns 0.
+int chainback_viterbi39(void *vit, unsigned char *decoded,
+                        unsigned int num_decoded_bits, unsigned int _) {
+    (void)_; // the last argument is unused
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
+
+/* Rate 1/6, k = 15 */
+// Create a decoder shim for rate 6, order 15 using the r16k15 polynomials.
+// Returns NULL for a negative bit count, which would otherwise be converted
+// to a huge unsigned buffer size.
+void *create_viterbi615(int num_decoded_bits) {
+    if (num_decoded_bits < 0) {
+        return NULL;
+    }
+    return create_viterbi((unsigned int)num_decoded_bits, 6, 15, r16k15);
+}
+
+// Release a handle returned by create_viterbi615.
+void delete_viterbi615(void *vit) {
+    delete_viterbi(vit);
+}
+
+// Rewind the shim's buffer cursors; always returns 0.
+int init_viterbi615(void *vit, int _) {
+    (void)_; // the second argument is unused
+    init_viterbi(vit);
+    return 0;
+}
+
+// Decode num_encoded_groups groups of soft values into the shim's buffer.
+// Returns 0 on success and -1 for a negative group count, which would
+// otherwise be converted to a huge unsigned value.
+int update_viterbi615_blk(void *vit, unsigned char *encoded_soft,
+                          int num_encoded_groups) {
+    if (num_encoded_groups < 0) {
+        return -1;
+    }
+    update_viterbi_blk(vit, encoded_soft, (unsigned int)num_encoded_groups);
+    return 0;
+}
+
+// Copy up to num_decoded_bits of decoded output into `decoded`; always
+// returns 0.
+int chainback_viterbi615(void *vit, unsigned char *decoded,
+                         unsigned int num_decoded_bits, unsigned int _) {
+    (void)_; // the last argument is unused
+    chainback_viterbi(vit, decoded, num_decoded_bits);
+    return 0;
+}
--- a/core/libcorrect/src/reed-solomon/CMakeLists.txt
+++ b/core/libcorrect/src/reed-solomon/CMakeLists.txt
@@ -0,0 +1,2 @@
+# Object library built from the Reed-Solomon sources in this directory.
+add_library(correct-reed-solomon OBJECT
+    polynomial.c
+    reed-solomon.c
+    encode.c
+    decode.c
+)
--- a/core/libcorrect/src/reed-solomon/decode.c
+++ b/core/libcorrect/src/reed-solomon/decode.c
@@ -0,0 +1,508 @@
+#include "correct/reed-solomon/encode.h"
+
+// evaluate the received polynomial at every root of the generator and store
+//   the results in syndromes[0 .. min_distance - 1]
+// a transmitted codeword is a multiple of the generator, so it evaluates to 0
+//   at each of those roots; whatever is left over after evaluation therefore
+//   comes from the error polynomial alone
+// all syndromes zero  -> nothing to correct
+// any syndrome nonzero -> the block was damaged in transit
+// returns true if syndromes are all zero
+static bool reed_solomon_find_syndromes(field_t field, polynomial_t msgpoly, field_logarithm_t **generator_root_exp,
+                                        field_element_t *syndromes, size_t min_distance) {
+    bool found_nonzero = false;
+    memset(syndromes, 0, min_distance * sizeof(field_element_t));
+    for (unsigned int root = 0; root < min_distance; root++) {
+        // this evaluation dominates decode time according to the original
+        // author's profiling, hence the lookup tables: generator_root_exp[root]
+        // holds the precomputed successive powers of generator root number `root`
+        syndromes[root] = polynomial_eval_lut(field, msgpoly, generator_root_exp[root]);
+        if (syndromes[root] != 0) {
+            found_nonzero = true;
+        }
+    }
+    return !found_nonzero;
+}
+
+// Berlekamp-Massey algorithm to find LFSR that describes syndromes
+//
+// reads rs->syndromes[0 .. min_distance - num_erasures - 1] and builds the
+//   error locator polynomial in rs->error_locator, using rs->last_error_locator
+//   as the saved copy of the locator from before the most recent lengthening
+// both coefficient buffers (min_distance + 1 entries are cleared/copied below)
+//   are overwritten by this call
+// returns rs->error_locator.order, i.e. the order of the locator that was built
+//   (note: this is the polynomial order field, not the local numerrors counter)
+static unsigned int reed_solomon_find_error_locator(correct_reed_solomon *rs, size_t num_erasures) {
+    unsigned int numerrors = 0;
+
+    memset(rs->error_locator.coeff, 0, (rs->min_distance + 1) * sizeof(field_element_t));
+
+    // initialize to f(x) = 1
+    rs->error_locator.coeff[0] = 1;
+    rs->error_locator.order = 0;
+
+    memcpy(rs->last_error_locator.coeff, rs->error_locator.coeff, (rs->min_distance + 1) * sizeof(field_element_t));
+    rs->last_error_locator.order = rs->error_locator.order;
+
+    field_element_t discrepancy;
+    field_element_t last_discrepancy = 1;
+    unsigned int delay_length = 1;
+
+    // i is the index of the syndrome the current LFSR is asked to reproduce
+    // (error_locator.order was just set to 0, so the loop starts at syndrome 0)
+    for (unsigned int i = rs->error_locator.order; i < rs->min_distance - num_erasures; i++) {
+        // discrepancy = S[i] + sum over j = 1..numerrors of locator[j] * S[i - j]
+        discrepancy = rs->syndromes[i];
+        for (unsigned int j = 1; j <= numerrors; j++) {
+            discrepancy = field_add(rs->field, discrepancy,
+                                    field_mul(rs->field, rs->error_locator.coeff[j], rs->syndromes[i - j]));
+        }
+
+        if (!discrepancy) {
+            // our existing LFSR describes the new syndrome as well
+            // leave it as-is but update the number of delay elements
+            //   so that if a discrepancy occurs later we can eliminate it
+            delay_length++;
+            continue;
+        }
+
+        if (2 * numerrors <= i) {
+            // there's a discrepancy, but we still have room for more taps
+            // lengthen LFSR by one tap and set weight to eliminate discrepancy
+
+            // shift the last locator by the delay length, multiply by discrepancy,
+            //   and divide by the last discrepancy
+            // we move down because we're shifting up, and this prevents overwriting
+            for (int j = rs->last_error_locator.order; j >= 0; j--) {
+                // the bounds here will be ok since we have a headroom of numerrors
+                rs->last_error_locator.coeff[j + delay_length] = field_div(
+                    rs->field, field_mul(rs->field, rs->last_error_locator.coeff[j], discrepancy), last_discrepancy);
+            }
+            // the low delay_length coefficients vacated by the shift become 0
+            for (int j = delay_length - 1; j >= 0; j--) {
+                rs->last_error_locator.coeff[j] = 0;
+            }
+
+            // locator = locator - last_locator
+            // we will also update last_locator to be locator before this loop takes place
+            field_element_t temp;
+            for (int j = 0; j <= (rs->last_error_locator.order + delay_length); j++) {
+                temp = rs->error_locator.coeff[j];
+                rs->error_locator.coeff[j] =
+                    field_add(rs->field, rs->error_locator.coeff[j], rs->last_error_locator.coeff[j]);
+                rs->last_error_locator.coeff[j] = temp;
+            }
+            // swap the recorded orders to match the coefficient swap above
+            unsigned int temp_order = rs->error_locator.order;
+            rs->error_locator.order = rs->last_error_locator.order + delay_length;
+            rs->last_error_locator.order = temp_order;
+
+            // now last_locator is locator before we started,
+            //   and locator is (locator - (discrepancy/last_discrepancy) * x^(delay_length) * last_locator)
+
+            numerrors = i + 1 - numerrors;
+            last_discrepancy = discrepancy;
+            delay_length = 1;
+            continue;
+        }
+
+        // no more taps
+        // unlike the previous case, we are preserving last locator,
+        //    but we'll update locator as before
+        // we're basically flattening the two loops from the previous case because
+        //    we no longer need to update last_locator
+        for (int j = rs->last_error_locator.order; j >= 0; j--) {
+            rs->error_locator.coeff[j + delay_length] =
+                field_add(rs->field, rs->error_locator.coeff[j + delay_length],
+                          field_div(rs->field, field_mul(rs->field, rs->last_error_locator.coeff[j], discrepancy),
+                                    last_discrepancy));
+        }
+        // the locator order becomes the larger of its old order and the shifted last locator's order
+        rs->error_locator.order = (rs->last_error_locator.order + delay_length > rs->error_locator.order)
+                                      ? rs->last_error_locator.order + delay_length
+                                      : rs->error_locator.order;
+        delay_length++;
+    }
+    return rs->error_locator.order;
+}
+
+// find the roots of the error locator polynomial
+// Chien search
+bool reed_solomon_factorize_error_locator(field_t field, unsigned int num_skip, polynomial_t locator_log, field_element_t *roots,
+                                          field_logarithm_t **element_exp) {
+    // normally it'd be tricky to find all the roots
+    // but, the finite field is awfully finite...
+    // just brute force search across every field element
+    unsigned int root = num_skip;
+    memset(roots + num_skip, 0, (locator_log.order) * sizeof(field_element_t));
+    for (field_operation_t i = 0; i < 256; i++) {
+        // we make two optimizations here to help this search go faster
+        // a) we have precomputed the first successive powers of every single element
+        //   in the field. we need at most n powers, where n is the largest possible
+        //   degree of the error locator
+        // b) we have precomputed the error locator polynomial in log form, which
+        //   helps reduce some lookups that would be done here
+        if (!polynomial_eval_log_lut(field, locator_log, element_exp[i])) {
+            roots[root] = (field_element_t)i;
+            root++;
+        }
+    }
+    // this is where we find out if we are have too many errors to recover from
+    // berlekamp-massey may have built an error locator that has 0 discrepancy
+    // on the syndromes but doesn't have enough roots
+    return root == locator_log.order + num_skip;
+}
+
+// use error locator and syndromes to find the error evaluator polynomial
+void reed_solomon_find_error_evaluator(field_t field, polynomial_t locator, polynomial_t syndromes,
+                                       polynomial_t error_evaluator) {
+    // the error evaluator, omega(x), is S(x)*Lamba(x) mod x^(2t)
+    // where S(x) is a polynomial constructed from the syndromes
+    //   S(1) + S(2)*x + ... + S(2t)*x(2t - 1)
+    // and Lambda(x) is the error locator
+    // the modulo is implicit here -- we have limited the max length of error_evaluator,
+    //   which polynomial_mul will interpret to mean that it should not compute
+    //   powers larger than that, which is the same as performing mod x^(2t)
+    polynomial_mul(field, locator, syndromes, error_evaluator);
+}
+
+// use error locator, error roots and syndromes to find the error values
+// that is, the elements in the finite field which can be added to the received
+//   polynomial at the locations of the error roots in order to produce the
+//   transmitted polynomial
+// forney algorithm
+void reed_solomon_find_error_values(correct_reed_solomon *rs) {
+    // error value e(j) = -(X(j)^(1-c) * omega(X(j)^-1))/(lambda'(X(j)^-1))
+    // where X(j)^-1 is a root of the error locator, omega(X) is the error evaluator,
+    //   lambda'(X) is the first formal derivative of the error locator,
+    //   and c is the first consecutive root of the generator used in encoding
+
+    // first find omega(X), the error evaluator
+    // we generate S(x), the polynomial constructed from the roots of the syndromes
+    // this is *not* the polynomial constructed by expanding the products of roots
+    // S(x) = S(1) + S(2)*x + ... + S(2t)*x(2t - 1)
+    polynomial_t syndrome_poly;
+    syndrome_poly.order = rs->min_distance - 1;
+    syndrome_poly.coeff = rs->syndromes;
+    memset(rs->error_evaluator.coeff, 0, (rs->error_evaluator.order + 1) * sizeof(field_element_t));
+    reed_solomon_find_error_evaluator(rs->field, rs->error_locator, syndrome_poly, rs->error_evaluator);
+
+    // now find lambda'(X)
+    rs->error_locator_derivative.order = rs->error_locator.order - 1;
+    polynomial_formal_derivative(rs->field, rs->error_locator, rs->error_locator_derivative);
+
+    // calculate each e(j)
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        if (rs->error_roots[i] == 0) {
+            continue;
+        }
+        rs->error_vals[i] = field_mul(
+            rs->field, field_pow(rs->field, rs->error_roots[i], rs->first_consecutive_root - 1),
+            field_div(
+                rs->field, polynomial_eval_lut(rs->field, rs->error_evaluator, rs->element_exp[rs->error_roots[i]]),
+                polynomial_eval_lut(rs->field, rs->error_locator_derivative, rs->element_exp[rs->error_roots[i]])));
+    }
+}
+
+void reed_solomon_find_error_locations(field_t field, field_logarithm_t generator_root_gap,
+                                       field_element_t *error_roots, field_logarithm_t *error_locations,
+                                       unsigned int num_errors, unsigned int num_skip) {
+    for (unsigned int i = 0; i < num_errors; i++) {
+        // the error roots are the reciprocals of the error locations, so div 1 by them
+
+        // we do mod 255 here because the log table aliases at index 1
+        // the log of 1 is both 0 and 255 (alpha^255 = alpha^0 = 1)
+        // for most uses it makes sense to have log(1) = 255, but in this case
+        // we're interested in a byte index, and the 255th index is not even valid
+        // just wrap it back to 0
+
+        if (error_roots[i] == 0) {
+            continue;
+        }
+
+        field_operation_t loc = field_div(field, 1, error_roots[i]);
+        for (field_operation_t j = 0; j < 256; j++) {
+            if (field_pow(field, j, generator_root_gap) == loc) {
+                error_locations[i] = field.log[j];
+                break;
+            }
+        }
+    }
+}
+
+// erasure method -- take given locations and convert to roots
+// this is the inverse of reed_solomon_find_error_locations
+static void reed_solomon_find_error_roots_from_locations(field_t field, field_logarithm_t generator_root_gap,
+                                                         const field_logarithm_t *error_locations,
+                                                         field_element_t *error_roots, unsigned int num_errors) {
+    for (unsigned int i = 0; i < num_errors; i++) {
+        field_element_t loc = field_pow(field, field.exp[error_locations[i]], generator_root_gap);
+        // field_element_t loc = field.exp[error_locations[i]];
+        error_roots[i] = field_div(field, 1, loc);
+        // error_roots[i] = loc;
+    }
+}
+
+// erasure method -- build the locator polynomial whose roots are error_roots
+// this is a thin wrapper: the product of the root factors is expanded by
+//   polynomial_init_from_roots, whose result is returned unchanged
+static polynomial_t reed_solomon_find_error_locator_from_roots(field_t field, unsigned int num_errors,
+                                                               field_element_t *error_roots,
+                                                               polynomial_t error_locator,
+                                                               polynomial_t *scratch) {
+    polynomial_t built = polynomial_init_from_roots(field, num_errors, error_roots, error_locator, scratch);
+    return built;
+}
+
+// erasure method -- multiply the syndrome polynomial by the given locator
+// both the input syndromes and the output are treated as polynomials of order
+//   min_distance - 1, so polynomial_mul truncates the product to that order
+// the product is written into modified_syndromes
+static void reed_solomon_find_modified_syndromes(correct_reed_solomon *rs, field_element_t *syndromes, polynomial_t error_locator, field_element_t *modified_syndromes) {
+    // view of the raw syndromes as a polynomial (no copy is made)
+    polynomial_t input_view;
+    input_view.coeff = syndromes;
+    input_view.order = rs->min_distance - 1;
+
+    // view of the destination buffer as a polynomial of the same order
+    polynomial_t output_view;
+    output_view.coeff = modified_syndromes;
+    output_view.order = rs->min_distance - 1;
+
+    polynomial_mul(rs->field, error_locator, input_view, output_view);
+}
+
+// allocate every buffer and lookup table the decoder needs and mark rs as
+//   ready for decoding (rs->has_init_decode)
+// none of the allocations are checked here
+void correct_reed_solomon_decoder_create(correct_reed_solomon *rs) {
+    rs->has_init_decode = true;
+
+    // plain arrays
+    rs->syndromes = calloc(rs->min_distance, sizeof(field_element_t));
+    rs->modified_syndromes = calloc(2 * rs->min_distance, sizeof(field_element_t));
+    rs->error_roots = calloc(2 * rs->min_distance, sizeof(field_element_t));
+    rs->error_vals = malloc(rs->min_distance * sizeof(field_element_t));
+    rs->error_locations = malloc(rs->min_distance * sizeof(field_logarithm_t));
+
+    // polynomials
+    rs->received_polynomial = polynomial_create(rs->block_length - 1);
+    rs->error_locator = polynomial_create(rs->min_distance);
+    rs->error_locator_log = polynomial_create(rs->min_distance);
+    rs->erasure_locator = polynomial_create(rs->min_distance);
+    rs->last_error_locator = polynomial_create(rs->min_distance);
+    rs->error_evaluator = polynomial_create(rs->min_distance - 1);
+    rs->error_locator_derivative = polynomial_create(rs->min_distance - 1);
+
+    // table 1: for each generator root, its first block_length powers
+    // syndrome calculation needs exactly these values, so caching them avoids
+    // recomputing them on every decode call
+    // cost: min_distance * block_length bytes, e.g. 32 * 255 ~= 8k
+    rs->generator_root_exp = malloc(rs->min_distance * sizeof(field_logarithm_t *));
+    for (unsigned int root = 0; root < rs->min_distance; root++) {
+        rs->generator_root_exp[root] = malloc(rs->block_length * sizeof(field_logarithm_t));
+        polynomial_build_exp_lut(rs->field, rs->generator_roots[root], rs->block_length - 1,
+                                 rs->generator_root_exp[root]);
+    }
+
+    // table 2: for each of the 256 field elements, its first min_distance powers
+    // chien search needs these, and error value calculation reuses them
+    // cost: 256 * min_distance bytes, e.g. 8k for min_distance = 32
+    rs->element_exp = malloc(256 * sizeof(field_logarithm_t *));
+    for (field_operation_t elem = 0; elem < 256; elem++) {
+        rs->element_exp[elem] = malloc(rs->min_distance * sizeof(field_logarithm_t));
+        polynomial_build_exp_lut(rs->field, elem, rs->min_distance - 1, rs->element_exp[elem]);
+    }
+
+    // scratch polynomials handed to the locator-from-roots construction
+    rs->init_from_roots_scratch[0] = polynomial_create(rs->min_distance);
+    rs->init_from_roots_scratch[1] = polynomial_create(rs->min_distance);
+}
+
+// decode one (possibly shortened) Reed-Solomon block
+//
+// encoded        : encoded_length bytes, payload first, parity last
+// encoded_length : must lie in [rs->min_distance, rs->block_length]
+// msg            : receives encoded_length - rs->min_distance payload bytes
+//
+// returns the number of payload bytes written to msg, or -1 when the length
+//   is out of range or the block has too many errors to be corrected
+// the decoder state inside rs is created lazily on the first call
+ssize_t correct_reed_solomon_decode(correct_reed_solomon *rs, const uint8_t *encoded, size_t encoded_length,
+                                    uint8_t *msg) {
+    if (encoded_length > rs->block_length) {
+        return -1;
+    }
+
+    // a block must at least contain its parity section; without this check the
+    //   unsigned subtraction below wraps around and msg_length becomes enormous,
+    //   which then drives the copy loops far out of bounds
+    if (encoded_length < rs->min_distance) {
+        return -1;
+    }
+
+    // the message is the non-remainder part
+    size_t msg_length = encoded_length - rs->min_distance;
+    // if they handed us a nonfull block, we'll write in 0s
+    size_t pad_length = rs->block_length - encoded_length;
+
+    if (!rs->has_init_decode) {
+        // initialize rs for decoding
+        correct_reed_solomon_decoder_create(rs);
+    }
+
+    // we need to copy to our local buffer
+    // the buffer we're given has the coordinates in the wrong direction
+    // e.g. byte 0 corresponds to the 254th order coefficient
+    // so we're going to flip and then write padding
+    // the final copied buffer will look like
+    // | rem (rs->min_distance) | msg (msg_length) | pad (pad_length) |
+
+    for (unsigned int i = 0; i < encoded_length; i++) {
+        rs->received_polynomial.coeff[i] = encoded[encoded_length - (i + 1)];
+    }
+
+    // fill the pad_length with 0s
+    for (unsigned int i = 0; i < pad_length; i++) {
+        rs->received_polynomial.coeff[i + encoded_length] = 0;
+    }
+
+    bool all_zero = reed_solomon_find_syndromes(rs->field, rs->received_polynomial, rs->generator_root_exp,
+                                                rs->syndromes, rs->min_distance);
+
+    if (all_zero) {
+        // syndromes were all zero, so there was no error in the message
+        // copy to msg and we are done
+        for (unsigned int i = 0; i < msg_length; i++) {
+            msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+        }
+        return msg_length;
+    }
+
+    // the locator search already stores the order in rs->error_locator and
+    //   returns that same value; it is assigned again here as in the original flow
+    unsigned int order = reed_solomon_find_error_locator(rs, 0);
+    rs->error_locator.order = order;
+
+    for (unsigned int i = 0; i <= rs->error_locator.order; i++) {
+        // this is a little strange since the coeffs are logs, not elements
+        // also, we'll be storing log(0) = 0 for any 0 coeffs in the error locator
+        // that would seem bad but we'll just be using this in chien search, and we'll skip all 0 coeffs
+        // (you might point out that log(1) also = 0, which would seem to alias. however, that's ok,
+        //   because log(1) = 255 as well, and in fact that's how it's represented in our log table)
+        rs->error_locator_log.coeff[i] = rs->field.log[rs->error_locator.coeff[i]];
+    }
+    rs->error_locator_log.order = rs->error_locator.order;
+
+    if (!reed_solomon_factorize_error_locator(rs->field, 0, rs->error_locator_log, rs->error_roots, rs->element_exp)) {
+        // roots couldn't be found, so there were too many errors to deal with
+        // RS has failed for this message
+        return -1;
+    }
+
+    reed_solomon_find_error_locations(rs->field, rs->generator_root_gap, rs->error_roots, rs->error_locations,
+                                      rs->error_locator.order, 0);
+
+    reed_solomon_find_error_values(rs);
+
+    // apply the corrections to the received polynomial in place
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        rs->received_polynomial.coeff[rs->error_locations[i]] =
+            field_sub(rs->field, rs->received_polynomial.coeff[rs->error_locations[i]], rs->error_vals[i]);
+    }
+
+    // un-reverse the payload part on the way out
+    for (unsigned int i = 0; i < msg_length; i++) {
+        msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+    }
+
+    return msg_length;
+}
+
+// decode one (possibly shortened) Reed-Solomon block when some damaged byte
+//   positions are already known to the caller
+//
+// encoded           : encoded_length bytes, payload first, parity last
+// encoded_length    : must lie in [rs->min_distance, rs->block_length]
+// erasure_locations : erasure_length byte indices into encoded, each of which
+//                     must be < encoded_length
+// erasure_length    : at most rs->min_distance; 0 falls back to
+//                     correct_reed_solomon_decode
+// msg               : receives encoded_length - rs->min_distance payload bytes
+//
+// returns the number of payload bytes written to msg, or -1 on an out-of-range
+//   argument, an allocation failure, or a block that cannot be corrected
+ssize_t correct_reed_solomon_decode_with_erasures(correct_reed_solomon *rs, const uint8_t *encoded,
+                                                  size_t encoded_length, const uint8_t *erasure_locations,
+                                                  size_t erasure_length, uint8_t *msg) {
+    if (encoded_length > rs->block_length) {
+        return -1;
+    }
+
+    // a block must at least contain its parity section; without this check the
+    //   unsigned msg_length computation below wraps around
+    if (encoded_length < rs->min_distance) {
+        return -1;
+    }
+
+    if (!erasure_length) {
+        return correct_reed_solomon_decode(rs, encoded, encoded_length, msg);
+    }
+
+    if (erasure_length > rs->min_distance) {
+        return -1;
+    }
+
+    // every erasure has to point inside the supplied block; an index past the
+    //   end would wrap in the coordinate remapping below and silently turn
+    //   into an unrelated location
+    for (size_t i = 0; i < erasure_length; i++) {
+        if (erasure_locations[i] >= encoded_length) {
+            return -1;
+        }
+    }
+
+    // the message is the non-remainder part
+    size_t msg_length = encoded_length - rs->min_distance;
+    // if they handed us a nonfull block, we'll write in 0s
+    size_t pad_length = rs->block_length - encoded_length;
+
+    if (!rs->has_init_decode) {
+        // initialize rs for decoding
+        correct_reed_solomon_decoder_create(rs);
+    }
+
+    // we need to copy to our local buffer
+    // the buffer we're given has the coordinates in the wrong direction
+    // e.g. byte 0 corresponds to the 254th order coefficient
+    // so we're going to flip and then write padding
+    // the final copied buffer will look like
+    // | rem (rs->min_distance) | msg (msg_length) | pad (pad_length) |
+
+    for (unsigned int i = 0; i < encoded_length; i++) {
+        rs->received_polynomial.coeff[i] = encoded[encoded_length - (i + 1)];
+    }
+
+    // fill the pad_length with 0s
+    for (unsigned int i = 0; i < pad_length; i++) {
+        rs->received_polynomial.coeff[i + encoded_length] = 0;
+    }
+
+    for (unsigned int i = 0; i < erasure_length; i++) {
+        // remap the coordinates of the erasures
+        rs->error_locations[i] = rs->block_length - (erasure_locations[i] + pad_length + 1);
+    }
+
+    reed_solomon_find_error_roots_from_locations(rs->field, rs->generator_root_gap, rs->error_locations,
+                                                 rs->error_roots, erasure_length);
+
+    rs->erasure_locator =
+        reed_solomon_find_error_locator_from_roots(rs->field, erasure_length, rs->error_roots, rs->erasure_locator, rs->init_from_roots_scratch);
+
+    bool all_zero = reed_solomon_find_syndromes(rs->field, rs->received_polynomial, rs->generator_root_exp,
+                                                rs->syndromes, rs->min_distance);
+
+    if (all_zero) {
+        // syndromes were all zero, so there was no error in the message
+        // copy to msg and we are done
+        for (unsigned int i = 0; i < msg_length; i++) {
+            msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+        }
+        return msg_length;
+    }
+
+    reed_solomon_find_modified_syndromes(rs, rs->syndromes, rs->erasure_locator, rs->modified_syndromes);
+
+    // keep the unmodified syndromes: rs->syndromes is temporarily overwritten
+    //   with the shifted modified syndromes for the locator search, and the
+    //   originals are needed again for the error value calculation
+    field_element_t *syndrome_copy = malloc(rs->min_distance * sizeof(field_element_t));
+    if (!syndrome_copy) {
+        return -1;
+    }
+    memcpy(syndrome_copy, rs->syndromes, rs->min_distance * sizeof(field_element_t));
+
+    for (unsigned int i = erasure_length; i < rs->min_distance; i++) {
+        rs->syndromes[i - erasure_length] = rs->modified_syndromes[i];
+    }
+
+    // the locator search already stores the order in rs->error_locator and
+    //   returns that same value; it is assigned again here as in the original flow
+    unsigned int order = reed_solomon_find_error_locator(rs, erasure_length);
+    rs->error_locator.order = order;
+
+    for (unsigned int i = 0; i <= rs->error_locator.order; i++) {
+        // this is a little strange since the coeffs are logs, not elements
+        // also, we'll be storing log(0) = 0 for any 0 coeffs in the error locator
+        // that would seem bad but we'll just be using this in chien search, and we'll skip all 0 coeffs
+        // (you might point out that log(1) also = 0, which would seem to alias. however, that's ok,
+        //   because log(1) = 255 as well, and in fact that's how it's represented in our log table)
+        rs->error_locator_log.coeff[i] = rs->field.log[rs->error_locator.coeff[i]];
+    }
+    rs->error_locator_log.order = rs->error_locator.order;
+
+    // the erasure roots occupy error_roots[0 .. erasure_length - 1]; the roots
+    //   of the unknown-error locator are appended after them
+    if (!reed_solomon_factorize_error_locator(rs->field, erasure_length, rs->error_locator_log, rs->error_roots, rs->element_exp)) {
+        // roots couldn't be found, so there were too many errors to deal with
+        // RS has failed for this message
+        free(syndrome_copy);
+        return -1;
+    }
+
+    // combined locator = erasure locator * error locator
+    polynomial_t temp_poly = polynomial_create(rs->error_locator.order + erasure_length);
+    if (!temp_poly.coeff) {
+        free(syndrome_copy);
+        return -1;
+    }
+    polynomial_mul(rs->field, rs->erasure_locator, rs->error_locator, temp_poly);
+    // swap the combined locator into rs for the helper calls below; the
+    //   decoder's own buffer is put back before returning
+    polynomial_t placeholder_poly = rs->error_locator;
+    rs->error_locator = temp_poly;
+
+    reed_solomon_find_error_locations(rs->field, rs->generator_root_gap, rs->error_roots, rs->error_locations,
+                                      rs->error_locator.order, erasure_length);
+
+    // restore the unmodified syndromes for the error value calculation
+    memcpy(rs->syndromes, syndrome_copy, rs->min_distance * sizeof(field_element_t));
+
+    reed_solomon_find_error_values(rs);
+
+    // apply the corrections to the received polynomial in place
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        rs->received_polynomial.coeff[rs->error_locations[i]] =
+            field_sub(rs->field, rs->received_polynomial.coeff[rs->error_locations[i]], rs->error_vals[i]);
+    }
+
+    rs->error_locator = placeholder_poly;
+
+    // un-reverse the payload part on the way out
+    for (unsigned int i = 0; i < msg_length; i++) {
+        msg[i] = rs->received_polynomial.coeff[encoded_length - (i + 1)];
+    }
+
+    polynomial_destroy(temp_poly);
+    free(syndrome_copy);
+
+    return msg_length;
+}
--- a/core/libcorrect/src/reed-solomon/encode.c
+++ b/core/libcorrect/src/reed-solomon/encode.c
@@ -0,0 +1,34 @@
+#include "correct/reed-solomon/encode.h"
+
+// encode one (possibly shortened) Reed-Solomon block
+//
+// msg        : msg_length payload bytes, msg_length <= rs->message_length
+// encoded    : receives the msg_length payload bytes followed by
+//              rs->min_distance remainder bytes
+// returns -1 when msg_length is too large, otherwise rs->block_length
+// NOTE(review): the return value is rs->block_length even for a shortened
+//   message, although only msg_length + rs->min_distance bytes are written --
+//   confirm this is what callers expect
+ssize_t correct_reed_solomon_encode(correct_reed_solomon *rs, const uint8_t *msg, size_t msg_length, uint8_t *encoded) {
+    if (msg_length > rs->message_length) {
+        return -1;
+    }
+
+    const size_t pad_length = rs->message_length - msg_length;
+    const unsigned int top = rs->encoded_polynomial.order;
+
+    // callers supply bytes from the highest-order coefficient downwards, while
+    // polynomials here are stored lowest order first, so the payload is
+    // reversed while it is copied in (and reversed back on the way out)
+    for (unsigned int k = 0; k < msg_length; k++) {
+        rs->encoded_polynomial.coeff[top - (k + pad_length)] = msg[k];
+    }
+
+    // clear everything that is not payload: first the pad_length coefficients
+    // above the payload, then the low coefficients below it
+    // the second range is never empty because the polynomial has block_length
+    // coefficients and the payload is at most message_length, e.g. 255 and 223
+    memset(rs->encoded_polynomial.coeff + (top + 1 - pad_length), 0, pad_length);
+    memset(rs->encoded_polynomial.coeff, 0, (top + 1 - rs->message_length));
+
+    polynomial_mod(rs->field, rs->encoded_polynomial, rs->generator, rs->encoded_remainder);
+
+    // payload goes out again in highest-to-lowest order
+    for (unsigned int k = 0; k < msg_length; k++) {
+        encoded[k] = rs->encoded_polynomial.coeff[top - (k + pad_length)];
+    }
+
+    // followed by the remainder, also highest order first
+    for (unsigned int k = 0; k < rs->min_distance; k++) {
+        encoded[msg_length + k] = rs->encoded_remainder.coeff[rs->min_distance - (k + 1)];
+    }
+
+    return rs->block_length;
+}
--- a/core/libcorrect/src/reed-solomon/polynomial.c
+++ b/core/libcorrect/src/reed-solomon/polynomial.c
@@ -0,0 +1,255 @@
+#include "correct/reed-solomon/polynomial.h"
+
+polynomial_t polynomial_create(unsigned int order) {
+    polynomial_t created;
+    created.order = order;
+    created.coeff = malloc((order + 1) * sizeof(field_element_t));  // x^0..x^order, left uninitialized
+    return created;
+}
+
+void polynomial_destroy(polynomial_t polynomial) {
+    free(polynomial.coeff);  // releases only the coefficient storage; the struct itself is passed by value
+}
+
+// multiplies l by r over the field and writes the product into res (res.coeff is overwritten)
+// res.order decides how much of the product is kept: l.order + r.order gives the full result,
+//    a smaller res.order keeps only the terms up to x^res.order
+void polynomial_mul(field_t field, polynomial_t l, polynomial_t r, polynomial_t res) {
+    // start from the zero polynomial and accumulate every partial product into it
+    memset(res.coeff, 0, sizeof(field_element_t) * (res.order + 1));
+    for (unsigned int li = 0; li <= l.order && li <= res.order; li++) {
+        // the highest term of r that still lands inside res once shifted up by li
+        unsigned int room = res.order - li;
+        unsigned int r_top = (r.order > room) ? room : r.order;
+        for (unsigned int ri = 0; ri <= r_top; ri++) {
+            // coefficients multiply while powers add, e.g. alpha^5*x * alpha^37*x^2 --> alpha^42*x^3
+            field_element_t term = field_mul(field, l.coeff[li], r.coeff[ri]);
+            res.coeff[li + ri] = field_add(field, res.coeff[li + ri], term);
+        }
+    }
+}
+
+void polynomial_mod(field_t field, polynomial_t dividend, polynomial_t divisor, polynomial_t mod) {
+    // computes dividend mod divisor by polynomial long division over the field
+    // mod.coeff is the working buffer: it starts as a copy of dividend and is reduced in place
+
+    if (mod.order < dividend.order) {
+        // mod.order must be >= dividend.order (scratch space needed)
+        // NOTE(review): this error is silent -- mod is left untouched and nothing is reported
+        return;
+    }
+    // initialize remainder as dividend
+    memcpy(mod.coeff, dividend.coeff, sizeof(field_element_t) * (dividend.order + 1));
+
+    // XXX make sure divisor[divisor_order] is nonzero
+    field_logarithm_t divisor_leading = field.log[divisor.coeff[divisor.order]];
+    // long division steps along one order at a time, starting at the highest order
+    for (unsigned int i = dividend.order; i > 0; i--) {  // NOTE(review): i == 0 is never visited, so an order-0 divisor leaves coeff[0] unreduced
+        // look at the leading coefficient of dividend and divisor
+        // if leading coefficient of dividend / leading coefficient of divisor is q
+        //   then the next row of subtraction will be q * divisor
+        // if order of q < 0 then what we have is the remainder and we are done
+        if (i < divisor.order) {
+            break;
+        }
+        if (mod.coeff[i] == 0) {  // this order is already cancelled, nothing to subtract
+            continue;
+        }
+        unsigned int q_order = i - divisor.order;
+        field_logarithm_t q_coeff = field_div_log(field, field.log[mod.coeff[i]], divisor_leading);  // kept as a logarithm
+
+        // now that we've chosen q, multiply the divisor by q and subtract from
+        //   our remainder. subtracting in GF(2^8) is XOR, just like addition
+        for (unsigned int j = 0; j <= divisor.order; j++) {
+            if (divisor.coeff[j] == 0) {  // a zero term of the divisor contributes nothing to q * divisor
+                continue;
+            }
+            // all of the multiplication is shifted up by q_order places
+            mod.coeff[j + q_order] = field_add(field, mod.coeff[j + q_order],
+                        field_mul_log_element(field, field.log[divisor.coeff[j]], q_coeff));
+        }
+    }
+}
+
+void polynomial_formal_derivative(field_t field, polynomial_t poly, polynomial_t der) {
+    // formal derivative: each term a(k)*x^k of poly becomes k*a(k)*x^(k-1) in der
+    // here k*a(k) means a(k) added to itself k times (field_sum), not a field product
+    // the caller is expected to pass der.order = poly.order - 1
+
+    memset(der.coeff, 0, sizeof(field_element_t) * (der.order + 1));
+
+    // walk the powers of poly that survive differentiation: x^1 up to x^(der.order + 1)
+    unsigned int n_terms = der.order + 1;
+    for (unsigned int k = 1; k <= n_terms; k++) {
+        // the x^k term of poly lands on x^(k - 1) in der
+        der.coeff[k - 1] = field_sum(field, poly.coeff[k], k);
+    }
+}
+
+field_element_t polynomial_eval(field_t field, polynomial_t poly, field_element_t val) {
+    // evaluates poly at val by summing coeff[k] * val^k, with the products formed from logarithms
+    if (val == 0) {
+        return poly.coeff[0];  // only the constant term is returned for val == 0
+    }
+
+    // logarithm of val^k for the term being processed; starts at val^0 = 1
+    field_logarithm_t power_log = field.log[1];
+    const field_logarithm_t step_log = field.log[val];
+    field_element_t sum = 0;
+
+    for (unsigned int k = 0; k <= poly.order; k++) {
+        const field_element_t c = poly.coeff[k];
+        if (c != 0) {
+            // zero coefficients are skipped: they add nothing to the sum
+            field_element_t term = field_mul_log_element(field, field.log[c], power_log);
+            sum = field_add(field, sum, term);
+        }
+        // move on to the next power of val for the following coefficient
+        power_log = field_mul_log(field, power_log, step_log);
+    }
+    return sum;
+}
+
+field_element_t polynomial_eval_lut(field_t field, polynomial_t poly, const field_logarithm_t *val_exp) {
+    // evaluates poly at the element whose successive powers are described by val_exp:
+    //   val_exp[k] is the precomputed logarithm of val^k (see polynomial_build_exp_lut)
+    // reusing the table saves recomputing those powers when one val is evaluated many times
+    if (val_exp[0] == 0) {
+        // polynomial_build_exp_lut writes 0 entries for val == 0; only the constant term is returned
+        return poly.coeff[0];
+    }
+
+    field_element_t sum = 0;
+    for (unsigned int k = 0; k <= poly.order; k++) {
+        const field_element_t c = poly.coeff[k];
+        if (c == 0) {
+            // a zero coefficient contributes nothing
+            continue;
+        }
+        sum = field_add(field, sum, field_mul_log_element(field, field.log[c], val_exp[k]));
+    }
+    return sum;
+}
+
+field_element_t polynomial_eval_log_lut(field_t field, polynomial_t poly_log, const field_logarithm_t *val_exp) {
+    // same job as polynomial_eval_lut, except that poly_log already stores the logarithm of
+    //   each coefficient; a stored 0 stands for a zero coefficient (whose log does not exist)
+    // val_exp[k] is the precomputed logarithm of val^k
+    if (val_exp[0] == 0) {
+        // only the constant term is returned here, converted back from its logarithm;
+        // the 0 sentinel maps straight to the element 0
+        const field_element_t constant_log = poly_log.coeff[0];
+        return (constant_log == 0) ? 0 : field.exp[constant_log];
+    }
+
+    field_element_t sum = 0;
+
+    for (unsigned int k = 0; k <= poly_log.order; k++) {
+        const field_element_t c_log = poly_log.coeff[k];
+        if (c_log == 0) {
+            // sentinel for a zero coefficient: nothing to add
+            continue;
+        }
+        // both operands are logarithms already, so no table lookup is needed for the coefficient
+        sum = field_add(field, sum, field_mul_log_element(field, c_log, val_exp[k]));
+    }
+    return sum;
+}
+
+void polynomial_build_exp_lut(field_t field, field_element_t val, unsigned int order, field_logarithm_t *val_exp) {
+    // fills val_exp[0..order] with the logarithms of val^0..val^order for polynomial_eval_lut
+    if (val == 0) {
+        memset(val_exp, 0, sizeof(field_logarithm_t) * (order + 1));  // val == 0: every entry is 0
+        return;
+    }
+    const field_logarithm_t step_log = field.log[val];
+    field_logarithm_t power_log = field.log[1];
+    for (unsigned int k = 0; k <= order; k++) {
+        val_exp[k] = power_log;
+        power_log = field_mul_log(field, power_log, step_log);
+    }
+}
+
+polynomial_t polynomial_init_from_roots(field_t field, unsigned int nroots, field_element_t *roots, polynomial_t poly, polynomial_t *scratch) {
+    // builds (x + roots[0])(x + roots[1])...(x + roots[nroots - 1]) into poly.coeff
+    // scratch must supply two polynomials with room for nroots + 1 coefficients (never fewer than 2)
+
+    // the linear factor (x + roots[i]) lives on the stack
+    // its x coefficient is always 1, only the constant term changes per root
+    field_element_t factor_coeff[2];
+    polynomial_t factor;
+    factor.coeff = factor_coeff;
+    factor.order = 1;
+    factor.coeff[1] = 1;
+
+    // the running product ping-pongs between the two scratch polynomials:
+    // each multiplication reads the previous product and writes into the other one,
+    // which avoids copying the product back after every step
+    polynomial_t product[2] = {scratch[0], scratch[1]};
+    unsigned int cur = 0;
+
+    // seed the product with the first factor, x + roots[0]
+    product[cur].order = 1;
+    product[cur].coeff[0] = roots[0];
+    product[cur].coeff[1] = 1;
+
+    // fold in one more root per iteration; the order grows by one each time
+    for (unsigned int i = 1; i < nroots; i++) {
+        unsigned int prev = cur;
+        cur = 1 - cur;
+        factor.coeff[0] = roots[i];
+        product[cur].order = i + 1;
+        polynomial_mul(field, factor, product[prev], product[cur]);
+    }
+
+    // copy the finished product out of scratch; poly is a by-value copy, so the
+    // updated order reaches the caller only through the return value
+    memcpy(poly.coeff, product[cur].coeff, (nroots + 1) * sizeof(field_element_t));
+    poly.order = nroots;
+
+    return poly;
+}
+
+polynomial_t polynomial_create_from_roots(field_t field, unsigned int nroots, field_element_t *roots) {
+    // allocates and returns (x + roots[0])(x + roots[1])...(x + roots[nroots - 1]), of order nroots
+    // the caller owns the result and releases it with polynomial_destroy
+    polynomial_t poly = polynomial_create(nroots);
+    if (nroots == 0) {
+        // empty product is the constant 1; the general path would read roots[0] and overrun 1-element buffers
+        poly.coeff[0] = 1;
+        return poly;
+    }
+
+    // linear factor (x + roots[i]), kept on the stack as polynomial_init_from_roots does
+    polynomial_t l;
+    field_element_t l_coeff[2];
+    l.order = 1;
+    l.coeff = l_coeff;
+    l.coeff[1] = 1;  // the x coefficient stays 1; the constant term is filled in per root
+
+    // two stores for the running product: each step reads one and writes the other (no copy needed)
+    polynomial_t r[2];
+    r[0].coeff = calloc(nroots + 1, sizeof(field_element_t));
+    r[1].coeff = calloc(nroots + 1, sizeof(field_element_t));
+    unsigned int rcoeffres = 0;
+
+    // initialize the result with x + roots[0]
+    r[rcoeffres].coeff[0] = roots[0];
+    r[rcoeffres].coeff[1] = 1;
+    r[rcoeffres].order = 1;
+
+    // multiply in the remaining roots one factor at a time
+    for (unsigned int i = 1; i < nroots; i++) {
+        l.coeff[0] = roots[i];
+        unsigned int nextrcoeff = rcoeffres;
+        rcoeffres = (rcoeffres + 1) % 2;
+        r[rcoeffres].order = i + 1;
+        polynomial_mul(field, l, r[nextrcoeff], r[rcoeffres]);
+    }
+
+    memcpy(poly.coeff, r[rcoeffres].coeff, (nroots + 1) * sizeof(field_element_t));  // poly.order is already nroots
+
+    free(r[0].coeff);
+    free(r[1].coeff);
+    return poly;
+}
--- a/core/libcorrect/src/reed-solomon/reed-solomon.c
+++ b/core/libcorrect/src/reed-solomon/reed-solomon.c
@@ -0,0 +1,187 @@
+#include "correct/reed-solomon/reed-solomon.h"
+
+// fills roots[0..nroots - 1] and returns a newly allocated generator polynomial built from them
+// e.g. 2 roots (x + alpha)(x + alpha^2) yields a poly with 3 terms x^2 + g0*x + g1
+static polynomial_t reed_solomon_build_generator(field_t field, unsigned int nroots, field_element_t first_consecutive_root, unsigned int root_gap, polynomial_t generator, field_element_t *roots) {
+    // root k is field.exp[root_gap * (k + first_consecutive_root)], with the exponent reduced mod 255
+    for (unsigned int k = 0; k < nroots; k++) {
+        unsigned int exponent = (root_gap * (k + first_consecutive_root)) % 255;
+        roots[k] = field.exp[exponent];
+    }
+    return polynomial_create_from_roots(field, nroots, roots);  // NOTE(review): the generator argument is never read
+}
+
+correct_reed_solomon *correct_reed_solomon_create(field_operation_t primitive_polynomial, field_logarithm_t first_consecutive_root, field_logarithm_t generator_root_gap, size_t num_roots) {
+    // builds a codec with 255-symbol blocks of which num_roots are parity symbols; returns NULL when
+    // num_roots is 0 (a code without parity) or above 255 (message_length below would wrap around)
+    if (num_roots == 0 || num_roots > 255) {
+        return NULL;
+    }
+    correct_reed_solomon *rs = calloc(1, sizeof(correct_reed_solomon));
+    rs->field = field_create(primitive_polynomial);
+    rs->block_length = 255;
+    rs->min_distance = num_roots;
+    rs->message_length = rs->block_length - rs->min_distance;
+    rs->first_consecutive_root = first_consecutive_root;
+    rs->generator_root_gap = generator_root_gap;
+    // the roots are stored alongside the generator polynomial that is built from them
+    rs->generator_roots = malloc(rs->min_distance * sizeof(field_element_t));
+    rs->generator = reed_solomon_build_generator(rs->field, rs->min_distance, rs->first_consecutive_root, rs->generator_root_gap, rs->generator, rs->generator_roots);
+
+    rs->encoded_polynomial = polynomial_create(rs->block_length - 1);
+    rs->encoded_remainder = polynomial_create(rs->block_length - 1);
+    rs->has_init_decode = false;  // decode-side members stay zeroed from calloc
+    return rs;
+}
+
+void correct_reed_solomon_destroy(correct_reed_solomon *rs) {  // frees rs and everything it owns
+    field_destroy(rs->field);
+    polynomial_destroy(rs->generator);
+    free(rs->generator_roots);
+    polynomial_destroy(rs->encoded_polynomial);  // encoder scratch, allocated in correct_reed_solomon_create
+    polynomial_destroy(rs->encoded_remainder);
+    if (rs->has_init_decode) {  // the members below are released only when this flag is set
+        free(rs->syndromes);
+        free(rs->modified_syndromes);
+        polynomial_destroy(rs->received_polynomial);
+        polynomial_destroy(rs->error_locator);
+        polynomial_destroy(rs->error_locator_log);
+        polynomial_destroy(rs->erasure_locator);
+        free(rs->error_roots);
+        free(rs->error_vals);
+        free(rs->error_locations);
+        polynomial_destroy(rs->last_error_locator);
+        polynomial_destroy(rs->error_evaluator);
+        polynomial_destroy(rs->error_locator_derivative);
+        for (unsigned int i = 0; i < rs->min_distance; i++) {  // min_distance rows, then the row table itself
+            free(rs->generator_root_exp[i]);
+        }
+        free(rs->generator_root_exp);
+        for (field_operation_t i = 0; i < 256; i++) {  // 256 rows, then the row table itself
+            free(rs->element_exp[i]);
+        }
+        free(rs->element_exp);
+        polynomial_destroy(rs->init_from_roots_scratch[0]);
+        polynomial_destroy(rs->init_from_roots_scratch[1]);
+    }
+    free(rs);  // NOTE(review): a NULL rs is not tolerated -- it is dereferenced from the first line on
+}
+
+void correct_reed_solomon_debug_print(correct_reed_solomon *rs) {
+    // prints the codec state to stdout: field tables, generator and encode remainder, then decode state
+    for (unsigned int i = 0; i < 256; i++) {
+        printf("%3d  %3d    %3d  %3d\n", i, rs->field.exp[i], i, rs->field.log[i]);
+    }
+    printf("\n");
+    printf("roots: ");
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        printf("%d", rs->generator_roots[i]);
+        if (i < rs->min_distance - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+    printf("generator: ");
+    for (unsigned int i = 0; i < rs->generator.order + 1; i++) {
+        printf("%d*x^%d", rs->generator.coeff[i], i);
+        if (i < rs->generator.order) {
+            printf(" + ");
+        }
+    }
+    printf("\n\n");
+
+    printf("generator (alpha format): ");
+    for (unsigned int i = rs->generator.order + 1; i > 0; i--) {
+        printf("alpha^%d*x^%d", rs->field.log[rs->generator.coeff[i - 1]], i - 1);
+        if (i > 1) {
+            printf(" + ");
+        }
+    }
+    printf("\n\n");
+
+    printf("remainder: ");
+    bool has_printed = false;
+    for (unsigned int i = 0; i < rs->encoded_remainder.order + 1; i++) {
+        if (!rs->encoded_remainder.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->encoded_remainder.coeff[i], i);
+    }
+    printf("\n\n");
+    // destroy frees the members printed below only when has_init_decode is set; treat them as absent until then
+    if (!rs->has_init_decode) {
+        return;
+    }
+    printf("syndromes: ");
+    for (unsigned int i = 0; i < rs->min_distance; i++) {
+        printf("%d", rs->syndromes[i]);
+        if (i < rs->min_distance - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+    printf("numerrors: %d\n\n", rs->error_locator.order);
+    printf("error locator: ");
+    has_printed = false;
+    for (unsigned int i = 0; i < rs->error_locator.order + 1; i++) {
+        if (!rs->error_locator.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->error_locator.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("error roots: ");
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        printf("%d@%d", polynomial_eval(rs->field, rs->error_locator, rs->error_roots[i]), rs->error_roots[i]);
+        if (i < rs->error_locator.order - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+
+    printf("error evaluator: ");
+    has_printed = false;
+    for (unsigned int i = 0; i < rs->error_evaluator.order; i++) {  // NOTE(review): stops before the x^order term, unlike the locator loop -- confirm intended
+        if (!rs->error_evaluator.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->error_evaluator.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("error locator derivative: ");
+    has_printed = false;
+    for (unsigned int i = 0; i < rs->error_locator_derivative.order; i++) {  // NOTE(review): same bound as the evaluator loop -- confirm intended
+        if (!rs->error_locator_derivative.coeff[i]) {
+            continue;
+        }
+        if (has_printed) {
+            printf(" + ");
+        }
+        has_printed = true;
+        printf("%d*x^%d", rs->error_locator_derivative.coeff[i], i);
+    }
+    printf("\n\n");
+
+    printf("error values: ");
+    for (unsigned int i = 0; i < rs->error_locator.order; i++) {
+        printf("%d@%d", rs->error_vals[i], rs->error_locations[i]);
+        if (i < rs->error_locator.order - 1) {
+            printf(", ");
+        }
+    }
+    printf("\n\n");
+}
--- a/core/libcorrect/tests/CMakeLists.txt
+++ b/core/libcorrect/tests/CMakeLists.txt
@@ -0,0 +1,54 @@
+include_directories("include")
+
+# Every runner is EXCLUDE_FROM_ALL: build them through the test_runners or check targets defined at the end.
+add_executable(convolutional_test_runner EXCLUDE_FROM_ALL convolutional.c $<TARGET_OBJECTS:error_sim>)
+target_link_libraries(convolutional_test_runner PRIVATE correct_static "${LIBM}")
+set_target_properties(convolutional_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME convolutional_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_test_runner)
+list(APPEND all_test_runners convolutional_test_runner)
+
+if(HAVE_SSE)
+    add_executable(convolutional_sse_test_runner EXCLUDE_FROM_ALL convolutional-sse.c $<TARGET_OBJECTS:error_sim_sse>)
+    target_link_libraries(convolutional_sse_test_runner PRIVATE correct_static "${LIBM}")
+    set_target_properties(convolutional_sse_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+    add_test(NAME convolutional_sse_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_sse_test_runner)
+    list(APPEND all_test_runners convolutional_sse_test_runner)
+endif()
+
+if(HAVE_LIBFEC)
+    add_executable(convolutional_fec_test_runner EXCLUDE_FROM_ALL convolutional-fec.c $<TARGET_OBJECTS:error_sim_fec>)
+    target_link_libraries(convolutional_fec_test_runner PRIVATE correct_static FEC "${LIBM}")
+    set_target_properties(convolutional_fec_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+    add_test(NAME convolutional_fec_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_fec_test_runner)
+    list(APPEND all_test_runners convolutional_fec_test_runner)
+endif()
+
+add_executable(convolutional_shim_test_runner EXCLUDE_FROM_ALL convolutional-shim.c $<TARGET_OBJECTS:error_sim_shim>)
+target_link_libraries(convolutional_shim_test_runner PRIVATE correct_static fec_shim_static "${LIBM}")
+set_target_properties(convolutional_shim_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME convolutional_shim_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND convolutional_shim_test_runner)
+list(APPEND all_test_runners convolutional_shim_test_runner)
+
+add_executable(reed_solomon_test_runner EXCLUDE_FROM_ALL reed-solomon.c rs_tester.c)
+target_link_libraries(reed_solomon_test_runner PRIVATE correct_static "${LIBM}")
+set_target_properties(reed_solomon_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME reed_solomon_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND reed_solomon_test_runner)
+list(APPEND all_test_runners reed_solomon_test_runner)
+
+if(HAVE_LIBFEC)
+    add_executable(reed_solomon_interop_test_runner EXCLUDE_FROM_ALL reed-solomon-fec-interop.c rs_tester.c rs_tester_fec.c)
+    target_link_libraries(reed_solomon_interop_test_runner PRIVATE correct_static FEC "${LIBM}")
+    set_target_properties(reed_solomon_interop_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+    add_test(NAME reed_solomon_interop_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND reed_solomon_interop_test_runner)
+    list(APPEND all_test_runners reed_solomon_interop_test_runner)
+endif()
+
+add_executable(reed_solomon_shim_interop_test_runner EXCLUDE_FROM_ALL reed-solomon-shim-interop.c rs_tester.c rs_tester_fec_shim.c)
+target_link_libraries(reed_solomon_shim_interop_test_runner PRIVATE correct_static fec_shim_static "${LIBM}")
+set_target_properties(reed_solomon_shim_interop_test_runner PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
+add_test(NAME reed_solomon_shim_interop_test WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/tests" COMMAND reed_solomon_shim_interop_test_runner)
+list(APPEND all_test_runners reed_solomon_shim_interop_test_runner)
+
+add_custom_target(test_runners DEPENDS ${all_test_runners})
+add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} DEPENDS test_runners VERBATIM)
+enable_testing()
--- a/core/libcorrect/tests/convolutional-fec.c
+++ b/core/libcorrect/tests/convolutional-fec.c
@@ -0,0 +1,123 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include <fec.h>
+
+#include "correct.h"
+#include "correct/util/error-sim-fec.h"
+
+size_t max_block_len = 4096;
+
+size_t test_conv(correct_convolutional *conv, void *fec,
+                 void (*decode)(void *, uint8_t *, size_t, uint8_t *),
+                 conv_testbench **testbench_ptr, size_t msg_len, double eb_n0,
+                 double bpsk_bit_energy, double bpsk_voltage) {
+    // feeds msg_len random bytes through the testbench in blocks of at most max_block_len; returns the summed test_conv_noise results
+    size_t error_total = 0;
+    uint8_t *block = malloc(max_block_len);
+
+    for (size_t remaining = msg_len; remaining > 0;) {
+        const size_t block_len = (remaining > max_block_len) ? max_block_len : remaining;
+        remaining -= block_len;
+
+        for (size_t k = 0; k < block_len; k++) {
+            block[k] = rand() % 256;
+        }
+
+        conv_testbench *testbench =
+            resize_conv_testbench(*testbench_ptr, conv_correct_enclen, conv, block_len);
+        *testbench_ptr = testbench;
+        testbench->encoder = conv;
+        testbench->encode = conv_correct_encode;
+        testbench->decoder = fec;
+        testbench->decode = decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        error_total += test_conv_noise(testbench, block, block_len, bpsk_voltage);
+    }
+    free(block);
+    return error_total;
+}
+
+void assert_test_result(correct_convolutional *conv, void *fec,
+                        void (*decode)(void *, uint8_t *, size_t, uint8_t *),
+                        conv_testbench **testbench, size_t test_length, size_t rate, size_t order,
+                        double eb_n0, double error_rate) {
+    // symbol energy is 2 * voltage^2; multiplying by rate gives the bit energy handed to test_conv
+    const double bpsk_voltage = 1.0 / sqrt(2.0);
+    const double bpsk_bit_energy = 2 * pow(bpsk_voltage, 2.0) * rate;
+    const size_t error_count =
+        test_conv(conv, fec, decode, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    // errors are normalized by 8 bits per message byte
+    const double observed_error_rate = error_count / ((double)test_length * 8);
+    if (!(observed_error_rate > error_rate)) {
+        printf(
+            "test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+            "order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+        return;
+    }
+    printf(
+        "test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+        "order %zu\n",
+        error_rate, observed_error_rate, eb_n0, rate, order);
+    exit(1);  // a missed target aborts the whole runner with a failing status
+}
+
+int main() {
+    srand(time(NULL));
+
+    conv_testbench *testbench = NULL;
+
+    correct_convolutional *conv;
+    void *fec;
+    uint16_t *poly;
+    // each section: libcorrect encoder + create_viterbiNN decoder for one code, checked at three noise levels
+    poly = (uint16_t[]){V27POLYA, V27POLYB};
+    conv = correct_convolutional_create(2, 7, poly);
+    fec = create_viterbi27(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec27_decode, &testbench, 1000000, 2, 7, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec27_decode, &testbench, 1000000, 2, 7, 4.5, 8e-06);
+    assert_test_result(conv, fec, conv_fec27_decode, &testbench, 1000000, 2, 7, 4.0, 5e-05);
+    delete_viterbi27(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V29POLYA, V29POLYB};
+    conv = correct_convolutional_create(2, 9, poly);
+    fec = create_viterbi29(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec29_decode, &testbench, 1000000, 2, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec29_decode, &testbench, 1000000, 2, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_fec29_decode, &testbench, 1000000, 2, 9, 4.0, 8e-06);
+    delete_viterbi29(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V39POLYA, V39POLYB, V39POLYC};
+    conv = correct_convolutional_create(3, 9, poly);
+    fec = create_viterbi39(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec39_decode, &testbench, 1000000, 3, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec39_decode, &testbench, 1000000, 3, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_fec39_decode, &testbench, 1000000, 3, 9, 4.0, 5e-06);
+    delete_viterbi39(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF};
+    conv = correct_convolutional_create(6, 15, poly);
+    fec = create_viterbi615(8 * max_block_len);
+    assert_test_result(conv, fec, conv_fec615_decode, &testbench, 100000, 6, 15, INFINITY, 0);
+    assert_test_result(conv, fec, conv_fec615_decode, &testbench, 100000, 6, 15, 3.0, 3e-06);
+    assert_test_result(conv, fec, conv_fec615_decode, &testbench, 100000, 6, 15, 2.5, 1e-05);
+    delete_viterbi615(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    free_scratch(testbench);  // the single testbench is shared by every section above
+    return 0;
+}
--- a/core/libcorrect/tests/convolutional-shim.c
+++ b/core/libcorrect/tests/convolutional-shim.c
@@ -0,0 +1,122 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "correct.h"
+#include "fec_shim.h"
+#include "correct/util/error-sim-shim.h"
+
+size_t max_block_len = 4096;
+
+size_t test_conv(correct_convolutional *conv, void *fec,
+                 ssize_t (*decode)(void *, uint8_t *, size_t, uint8_t *),
+                 conv_testbench **testbench_ptr, size_t msg_len, double eb_n0,
+                 double bpsk_bit_energy, double bpsk_voltage) {
+    // feeds msg_len random bytes through the testbench in blocks of at most max_block_len; returns the summed test_conv_noise results
+    size_t error_total = 0;
+    uint8_t *block = malloc(max_block_len);
+
+    for (size_t remaining = msg_len; remaining > 0;) {
+        const size_t block_len = (remaining > max_block_len) ? max_block_len : remaining;
+        remaining -= block_len;
+
+        for (size_t k = 0; k < block_len; k++) {
+            block[k] = rand() % 256;
+        }
+
+        conv_testbench *testbench =
+            resize_conv_testbench(*testbench_ptr, conv_correct_enclen, conv, block_len);
+        *testbench_ptr = testbench;
+        testbench->encoder = conv;
+        testbench->encode = conv_correct_encode;
+        testbench->decoder = fec;
+        testbench->decode = decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        error_total += test_conv_noise(testbench, block, block_len, bpsk_voltage);
+    }
+    free(block);
+    return error_total;
+}
+
+void assert_test_result(correct_convolutional *conv, void *fec,
+                        ssize_t (*decode)(void *, uint8_t *, size_t, uint8_t *),
+                        conv_testbench **testbench, size_t test_length, size_t rate, size_t order,
+                        double eb_n0, double error_rate) {
+    // symbol energy is 2 * voltage^2; multiplying by rate gives the bit energy handed to test_conv
+    const double bpsk_voltage = 1.0 / sqrt(2.0);
+    const double bpsk_bit_energy = 2 * pow(bpsk_voltage, 2.0) * rate;
+    const size_t error_count =
+        test_conv(conv, fec, decode, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    // errors are normalized by 8 bits per message byte
+    const double observed_error_rate = error_count / ((double)test_length * 8);
+    if (!(observed_error_rate > error_rate)) {
+        printf(
+            "test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+            "order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+        return;
+    }
+    printf(
+        "test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu "
+        "order %zu\n",
+        error_rate, observed_error_rate, eb_n0, rate, order);
+    exit(1);  // a missed target aborts the whole runner with a failing status
+}
+
+int main() {
+    srand(time(NULL));
+
+    conv_testbench *testbench = NULL;
+
+    correct_convolutional *conv;
+    void *fec;
+    uint16_t *poly;
+    // each section: libcorrect encoder + create_viterbiNN decoder for one code, checked at three noise levels
+    poly = (uint16_t[]){V27POLYA, V27POLYB};
+    conv = correct_convolutional_create(2, 7, poly);
+    fec = create_viterbi27(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim27_decode, &testbench, 1000000, 2, 7, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim27_decode, &testbench, 1000000, 2, 7, 4.5, 8e-06);
+    assert_test_result(conv, fec, conv_shim27_decode, &testbench, 1000000, 2, 7, 4.0, 5e-05);
+    delete_viterbi27(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V29POLYA, V29POLYB};
+    conv = correct_convolutional_create(2, 9, poly);
+    fec = create_viterbi29(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim29_decode, &testbench, 1000000, 2, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim29_decode, &testbench, 1000000, 2, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_shim29_decode, &testbench, 1000000, 2, 9, 4.0, 8e-06);
+    delete_viterbi29(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V39POLYA, V39POLYB, V39POLYC};
+    conv = correct_convolutional_create(3, 9, poly);
+    fec = create_viterbi39(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim39_decode, &testbench, 1000000, 3, 9, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim39_decode, &testbench, 1000000, 3, 9, 4.5, 3e-06);
+    assert_test_result(conv, fec, conv_shim39_decode, &testbench, 1000000, 3, 9, 4.0, 9e-06);
+    delete_viterbi39(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    poly = (uint16_t[]){V615POLYA, V615POLYB, V615POLYC, V615POLYD, V615POLYE, V615POLYF};
+    conv = correct_convolutional_create(6, 15, poly);
+    fec = create_viterbi615(8 * max_block_len);
+    assert_test_result(conv, fec, conv_shim615_decode, &testbench, 100000, 6, 15, INFINITY, 0);
+    assert_test_result(conv, fec, conv_shim615_decode, &testbench, 100000, 6, 15, 3.0, 2e-05);
+    assert_test_result(conv, fec, conv_shim615_decode, &testbench, 100000, 6, 15, 2.5, 4e-05);
+    delete_viterbi615(fec);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+
+    free_scratch(testbench);  // the single testbench is shared by every section above
+    return 0;
+}
--- a/core/libcorrect/tests/convolutional-sse.c
+++ b/core/libcorrect/tests/convolutional-sse.c
@@ -0,0 +1,132 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "correct/util/error-sim-sse.h"
+
+// Upper bound on how many message bytes are pushed through the testbench in
+// one pass; longer messages are processed in chunks of this size.
+size_t max_block_len = 4096;
+
+// Feeds msg_len random bytes through the SSE convolutional codec in chunks of
+// at most max_block_len bytes and accumulates whatever test_conv_noise()
+// reports for each chunk.
+//
+//   conv             codec used as both encoder and decoder
+//   testbench_ptr    in/out: testbench handle, replaced by the result of
+//                    resize_conv_testbench() for every chunk
+//   msg_len          total number of message bytes to test
+//   eb_n0            forwarded to build_white_noise()
+//   bpsk_bit_energy  forwarded to build_white_noise()
+//   bpsk_voltage     forwarded to test_conv_noise()
+//
+// Returns the sum of the test_conv_noise() results (presumably a bit-error
+// count, since the caller divides it by msg_len * 8 -- confirm in error-sim).
+// Exits the process with status 1 if the scratch message buffer cannot be
+// allocated.
+size_t test_conv(correct_convolutional_sse *conv, conv_testbench **testbench_ptr,
+               size_t msg_len, double eb_n0, double bpsk_bit_energy,
+               double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len);
+    if (msg == NULL) {
+        // Without this check the fill loop below would write through NULL.
+        fprintf(stderr, "test_conv: unable to allocate %zu byte message buffer\n", max_block_len);
+        exit(1);
+    }
+
+    size_t num_errors = 0;
+
+    while (msg_len) {
+        size_t block_len = (max_block_len < msg_len) ? max_block_len : msg_len;
+        msg_len -= block_len;
+
+        // size_t index so the comparison against block_len is not mixed-width
+        for (size_t j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;
+        }
+
+        *testbench_ptr = resize_conv_testbench(*testbench_ptr, conv_correct_sse_enclen, conv, block_len);
+        conv_testbench *testbench = *testbench_ptr;
+        testbench->encoder = conv;
+        testbench->encode = conv_correct_sse_encode;
+        testbench->decoder = conv;
+        testbench->decode = conv_correct_sse_decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        num_errors += test_conv_noise(testbench, msg, block_len, bpsk_voltage);
+    }
+    free(msg);
+    return num_errors;
+}
+
+// Runs test_conv() over test_length message bytes at the given eb_n0 and
+// compares the observed error rate (errors / (test_length * 8)) with the
+// allowed error_rate. Prints a one-line verdict either way; when the observed
+// rate is above the limit the process exits with status 1.
+//
+// rate and order are used for the bit-energy scaling (rate) and for the
+// printed report (both).
+void assert_test_result(correct_convolutional_sse *conv, conv_testbench **testbench,
+                        size_t test_length, size_t rate, size_t order, double eb_n0, double error_rate) {
+    double bpsk_voltage = 1.0/sqrt(2.0);
+    double bpsk_sym_energy = 2*pow(bpsk_voltage, 2.0);
+    double bpsk_bit_energy = bpsk_sym_energy * rate;
+
+    size_t error_count = test_conv(conv, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    double total_bits = (double)test_length * 8;
+    double observed_error_rate = error_count/total_bits;
+
+    // Success path first; written as a negated '>' so the comparison behaves
+    // exactly like the failure test it replaces.
+    if (!(observed_error_rate > error_rate)) {
+        printf("test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+                error_rate, observed_error_rate, eb_n0, rate, order);
+        return;
+    }
+
+    printf("test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+    exit(1);
+}
+
+// Applies the three checks main uses for every codec configuration: one run at
+// infinite Eb/N0 with an allowed error rate of 0, then two runs at finite
+// Eb/N0 with their own error-rate ceilings. Afterwards the codec is destroyed
+// and a blank line is printed to separate the groups in the output.
+static void run_codec_suite(correct_convolutional_sse *conv, conv_testbench **testbench,
+                            size_t rate, size_t order,
+                            double first_eb_n0, double first_limit,
+                            double second_eb_n0, double second_limit) {
+    assert_test_result(conv, testbench, 1000000, rate, order, INFINITY, 0);
+    assert_test_result(conv, testbench, 1000000, rate, order, first_eb_n0, first_limit);
+    assert_test_result(conv, testbench, 1000000, rate, order, second_eb_n0, second_limit);
+    correct_convolutional_sse_destroy(conv);
+
+    printf("\n");
+}
+
+// Entry point: exercises the SSE convolutional codec for rates 2 and 3 at
+// orders 6 through 9. Any failed check exits with status 1 from inside
+// assert_test_result(); reaching the end returns 0.
+int main() {
+    srand(time(NULL));
+
+    conv_testbench *testbench = NULL;
+
+    // n.b. order 6 polynomials are checked at 5.0dB/4.5dB, while orders 7-9
+    //  are checked at 4.5dB/4.0dB. this can be easy to miss.
+
+    run_codec_suite(correct_convolutional_sse_create(2, 6, correct_conv_r12_6_polynomial),
+                    &testbench, 2, 6, 5.0, 8e-06, 4.5, 3e-05);
+    run_codec_suite(correct_convolutional_sse_create(2, 7, correct_conv_r12_7_polynomial),
+                    &testbench, 2, 7, 4.5, 1e-05, 4.0, 5e-05);
+    run_codec_suite(correct_convolutional_sse_create(2, 8, correct_conv_r12_8_polynomial),
+                    &testbench, 2, 8, 4.5, 5e-06, 4.0, 3e-05);
+    run_codec_suite(correct_convolutional_sse_create(2, 9, correct_conv_r12_9_polynomial),
+                    &testbench, 2, 9, 4.5, 3e-06, 4.0, 8e-06);
+
+    run_codec_suite(correct_convolutional_sse_create(3, 6, correct_conv_r13_6_polynomial),
+                    &testbench, 3, 6, 5.0, 5e-06, 4.5, 2e-05);
+    run_codec_suite(correct_convolutional_sse_create(3, 7, correct_conv_r13_7_polynomial),
+                    &testbench, 3, 7, 4.5, 5e-06, 4.0, 3e-05);
+    run_codec_suite(correct_convolutional_sse_create(3, 8, correct_conv_r13_8_polynomial),
+                    &testbench, 3, 8, 4.5, 4e-06, 4.0, 1e-05);
+    run_codec_suite(correct_convolutional_sse_create(3, 9, correct_conv_r13_9_polynomial),
+                    &testbench, 3, 9, 4.5, 3e-06, 4.0, 5e-06);
+
+    free_scratch(testbench);
+    return 0;
+}
--- a/core/libcorrect/tests/convolutional.c
+++ b/core/libcorrect/tests/convolutional.c
@@ -0,0 +1,133 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "correct.h"
+#include "correct/util/error-sim.h"
+
+// Upper bound on how many message bytes are pushed through the testbench in
+// one pass; longer messages are processed in chunks of this size.
+size_t max_block_len = 4096;
+
+// Feeds msg_len random bytes through the convolutional codec in chunks of at
+// most max_block_len bytes and accumulates whatever test_conv_noise() reports
+// for each chunk.
+//
+//   conv             codec used as both encoder and decoder
+//   testbench_ptr    in/out: testbench handle, replaced by the result of
+//                    resize_conv_testbench() for every chunk
+//   msg_len          total number of message bytes to test
+//   eb_n0            forwarded to build_white_noise()
+//   bpsk_bit_energy  forwarded to build_white_noise()
+//   bpsk_voltage     forwarded to test_conv_noise()
+//
+// Returns the sum of the test_conv_noise() results (presumably a bit-error
+// count, since the caller divides it by msg_len * 8 -- confirm in error-sim).
+// Exits the process with status 1 if the scratch message buffer cannot be
+// allocated.
+size_t test_conv(correct_convolutional *conv, conv_testbench **testbench_ptr,
+               size_t msg_len, double eb_n0, double bpsk_bit_energy,
+               double bpsk_voltage) {
+    uint8_t *msg = malloc(max_block_len);
+    if (msg == NULL) {
+        // Without this check the fill loop below would write through NULL.
+        fprintf(stderr, "test_conv: unable to allocate %zu byte message buffer\n", max_block_len);
+        exit(1);
+    }
+
+    size_t num_errors = 0;
+
+    while (msg_len) {
+        size_t block_len = (max_block_len < msg_len) ? max_block_len : msg_len;
+        msg_len -= block_len;
+
+        // size_t index so the comparison against block_len is not mixed-width
+        for (size_t j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;
+        }
+
+        *testbench_ptr = resize_conv_testbench(*testbench_ptr, conv_correct_enclen, conv, block_len);
+        conv_testbench *testbench = *testbench_ptr;
+        testbench->encoder = conv;
+        testbench->encode = conv_correct_encode;
+        testbench->decoder = conv;
+        testbench->decode = conv_correct_decode;
+        build_white_noise(testbench->noise, testbench->enclen, eb_n0, bpsk_bit_energy);
+        num_errors += test_conv_noise(testbench, msg, block_len, bpsk_voltage);
+    }
+    free(msg);
+    return num_errors;
+}
+
+// Runs test_conv() over test_length message bytes at the given eb_n0 and
+// compares the observed error rate (errors / (test_length * 8)) with the
+// allowed error_rate. Prints a one-line verdict either way; when the observed
+// rate is above the limit the process exits with status 1.
+//
+// rate and order are used for the bit-energy scaling (rate) and for the
+// printed report (both).
+void assert_test_result(correct_convolutional *conv, conv_testbench **testbench,
+                        size_t test_length, size_t rate, size_t order, double eb_n0, double error_rate) {
+    double bpsk_voltage = 1.0/sqrt(2.0);
+    double bpsk_sym_energy = 2*pow(bpsk_voltage, 2.0);
+    double bpsk_bit_energy = bpsk_sym_energy * rate;
+
+    size_t error_count = test_conv(conv, testbench, test_length, eb_n0, bpsk_bit_energy, bpsk_voltage);
+    double total_bits = (double)test_length * 8;
+    double observed_error_rate = error_count/total_bits;
+
+    // Success path first; written as a negated '>' so the comparison behaves
+    // exactly like the failure test it replaces.
+    if (!(observed_error_rate > error_rate)) {
+        printf("test passed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+                error_rate, observed_error_rate, eb_n0, rate, order);
+        return;
+    }
+
+    printf("test failed, expected error rate=%.2e, observed error rate=%.2e @%.1fdB for rate %zu order %zu\n",
+            error_rate, observed_error_rate, eb_n0, rate, order);
+    exit(1);
+}
+
+// Applies the three checks main uses for every codec configuration: one run at
+// infinite Eb/N0 with an allowed error rate of 0, then two runs at finite
+// Eb/N0 with their own error-rate ceilings. Afterwards the codec is destroyed
+// and a blank line is printed to separate the groups in the output.
+static void run_codec_suite(correct_convolutional *conv, conv_testbench **testbench,
+                            size_t rate, size_t order,
+                            double first_eb_n0, double first_limit,
+                            double second_eb_n0, double second_limit) {
+    assert_test_result(conv, testbench, 1000000, rate, order, INFINITY, 0);
+    assert_test_result(conv, testbench, 1000000, rate, order, first_eb_n0, first_limit);
+    assert_test_result(conv, testbench, 1000000, rate, order, second_eb_n0, second_limit);
+    correct_convolutional_destroy(conv);
+
+    printf("\n");
+}
+
+// Entry point: exercises the convolutional codec for rates 2 and 3 at orders
+// 6 through 9. Any failed check exits with status 1 from inside
+// assert_test_result(); reaching the end returns 0.
+int main() {
+    srand(time(NULL));
+
+    conv_testbench *testbench = NULL;
+
+    // n.b. order 6 polynomials are checked at 5.0dB/4.5dB, while orders 7-9
+    //  are checked at 4.5dB/4.0dB. this can be easy to miss.
+
+    run_codec_suite(correct_convolutional_create(2, 6, correct_conv_r12_6_polynomial),
+                    &testbench, 2, 6, 5.0, 5e-06, 4.5, 3e-05);
+    run_codec_suite(correct_convolutional_create(2, 7, correct_conv_r12_7_polynomial),
+                    &testbench, 2, 7, 4.5, 1e-05, 4.0, 5e-05);
+    run_codec_suite(correct_convolutional_create(2, 8, correct_conv_r12_8_polynomial),
+                    &testbench, 2, 8, 4.5, 5e-06, 4.0, 3e-05);
+    run_codec_suite(correct_convolutional_create(2, 9, correct_conv_r12_9_polynomial),
+                    &testbench, 2, 9, 4.5, 3e-06, 4.0, 1e-05);
+
+    run_codec_suite(correct_convolutional_create(3, 6, correct_conv_r13_6_polynomial),
+                    &testbench, 3, 6, 5.0, 5e-06, 4.5, 2e-05);
+    run_codec_suite(correct_convolutional_create(3, 7, correct_conv_r13_7_polynomial),
+                    &testbench, 3, 7, 4.5, 5e-06, 4.0, 3e-05);
+    run_codec_suite(correct_convolutional_create(3, 8, correct_conv_r13_8_polynomial),
+                    &testbench, 3, 8, 4.5, 4e-06, 4.0, 1e-05);
+    run_codec_suite(correct_convolutional_create(3, 9, correct_conv_r13_9_polynomial),
+                    &testbench, 3, 9, 4.5, 3e-06, 4.0, 5e-06);
+
+    free_scratch(testbench);
+    return 0;
+}
--- a/core/libcorrect/tests/include/rs_tester.h
+++ b/core/libcorrect/tests/include/rs_tester.h
@@ -0,0 +1,41 @@
+// Shared declarations for the Reed-Solomon test programs: encoder/decoder
+// adapter signatures, the scratch-buffer testbench, and the test driver.
+#ifndef RS_TESTER_H
+#define RS_TESTER_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "correct.h"
+
+// Adapter with the rs_test.encode signature that encodes through libcorrect.
+void rs_correct_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                       uint8_t *msg_out);
+// Adapter with the rs_test.decode signature that decodes through libcorrect.
+void rs_correct_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                       uint8_t *erasure_locations, size_t erasure_length,
+                       uint8_t *msg, size_t pad_length, size_t num_roots);
+
+// Scratch buffers and dimensions reused across test iterations. Buffer sizes
+// below are those allocated by rs_testbench_create().
+typedef struct {
+    size_t block_length;          // total block size passed to rs_testbench_create
+    size_t message_length;        // block_length - min_distance
+    size_t min_distance;          // value passed to rs_testbench_create
+    unsigned char *msg;           // message_length bytes
+    uint8_t *encoded;             // block_length bytes
+    int *indices;                 // block_length ints
+    uint8_t *corrupted_encoded;   // block_length bytes
+    uint8_t *erasure_locations;   // min_distance bytes
+    unsigned char *recvmsg;       // message_length bytes
+} rs_testbench;
+
+// An encoder/decoder pairing to exercise. The opaque encoder/decoder pointers
+// are handed back to the matching callback as its first argument.
+typedef struct {
+    void (*encode)(void *, uint8_t *, size_t, uint8_t *);
+    void *encoder;
+    void (*decode)(void *, uint8_t *, size_t, uint8_t *, size_t, uint8_t *, size_t, size_t);
+    void *decoder;
+} rs_test;
+
+// Allocates a testbench sized for the given block length and minimum distance.
+rs_testbench *rs_testbench_create(size_t block_length, size_t min_distance);
+// Releases a testbench obtained from rs_testbench_create().
+void rs_testbench_destroy(rs_testbench *testbench);
+
+// Result of a single test_rs_errors() call.
+typedef struct {
+    bool output_matches;
+} rs_test_run;
+
+// Runs one encode/corrupt/decode round using the given pairing and testbench.
+rs_test_run test_rs_errors(rs_test *test, rs_testbench *testbench, size_t msg_length,
+                    size_t num_errors, size_t num_erasures);
+
+#endif // RS_TESTER_H
--- a/core/libcorrect/tests/include/rs_tester_fec.h
+++ b/core/libcorrect/tests/include/rs_tester_fec.h
@@ -0,0 +1,10 @@
+// Declarations of the rs_test-compatible adapters backed by <fec.h>.
+#ifndef RS_TESTER_FEC_H
+#define RS_TESTER_FEC_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fec.h>
+
+// Adapter with the rs_test.encode signature for the <fec.h> implementation.
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                   uint8_t *msg_out);
+// Adapter with the rs_test.decode signature for the <fec.h> implementation.
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots);
+
+#endif // RS_TESTER_FEC_H
--- a/core/libcorrect/tests/include/rs_tester_fec_shim.h
+++ b/core/libcorrect/tests/include/rs_tester_fec_shim.h
@@ -0,0 +1,10 @@
+// Declarations of the rs_test-compatible adapters backed by "fec_shim.h".
+#ifndef RS_TESTER_FEC_SHIM_H
+#define RS_TESTER_FEC_SHIM_H
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <string.h>
+#include "fec_shim.h"
+
+// Adapter with the rs_test.encode signature for the fec_shim implementation.
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                   uint8_t *msg_out);
+// Adapter with the rs_test.decode signature for the fec_shim implementation.
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots);
+
+#endif // RS_TESTER_FEC_SHIM_H
--- a/core/libcorrect/tests/reed-solomon-fec-interop.c
+++ b/core/libcorrect/tests/reed-solomon-fec-interop.c
@@ -0,0 +1,138 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "rs_tester.h"
+#include "rs_tester_fec.h"
+
+// Announces the configuration about to be tested. No newline is written, so
+// the PASSED/FAILED verdict printed later lands on the same line.
+void print_test_type(size_t block_length, size_t message_length,
+                     size_t num_errors, size_t num_erasures) {
+    printf("testing reed solomon block length=%zu, message length=%zu, "
+           "errors=%zu, erasures=%zu...",
+           block_length, message_length, num_errors, num_erasures);
+}
+
+// Reports a failed configuration and terminates the program with status 1.
+void fail_test() {
+    fputs("FAILED\n", stdout);
+    exit(1);
+}
+
+// Reports a passed configuration.
+void pass_test() {
+    fputs("PASSED\n", stdout);
+}
+
+// Calls test_rs_errors() num_iterations times with the encoder/decoder pairing
+// stored in *test; the first run whose output does not match ends the program
+// through fail_test().
+static void run_direction(rs_test *test, rs_testbench *testbench,
+                          size_t test_msg_length, size_t num_errors,
+                          size_t num_erasures, size_t num_iterations) {
+    size_t remaining = num_iterations;
+    while (remaining > 0) {
+        rs_test_run outcome = test_rs_errors(test, testbench, test_msg_length,
+                                             num_errors, num_erasures);
+        if (!outcome.output_matches) {
+            fail_test();
+        }
+        remaining--;
+    }
+}
+
+// Interop check for one configuration: first encode with libcorrect and decode
+// with fec, then encode with fec and decode with libcorrect. Each direction is
+// repeated num_iterations times; PASSED is printed only if every run matched.
+void run_tests(correct_reed_solomon *rs, void *fec_rs, rs_testbench *testbench,
+               size_t block_length, size_t test_msg_length, size_t num_errors,
+               size_t num_erasures, size_t num_iterations) {
+    print_test_type(block_length, test_msg_length, num_errors, num_erasures);
+
+    // correct -> fec
+    rs_test correct_to_fec = {
+        .encode = rs_correct_encode,
+        .encoder = rs,
+        .decode = rs_fec_decode,
+        .decoder = fec_rs,
+    };
+    run_direction(&correct_to_fec, testbench, test_msg_length, num_errors,
+                  num_erasures, num_iterations);
+
+    // fec -> correct
+    rs_test fec_to_correct = {
+        .encode = rs_fec_encode,
+        .encoder = fec_rs,
+        .decode = rs_correct_decode,
+        .decoder = rs,
+    };
+    run_direction(&fec_to_correct, testbench, test_msg_length, num_errors,
+                  num_erasures, num_iterations);
+
+    pass_test();
+}
+
+// Entry point: for minimum distances 32 and 16 on a 255-byte block, runs the
+// interop suite twice -- once with the fec codec padded by half the message
+// length and once unpadded. Each pass covers four error/erasure mixes at
+// 20000 iterations. Failures exit from within run_tests(); otherwise returns 0.
+int main() {
+    srand(time(NULL));
+
+    size_t block_length = 255;
+    const size_t distances[] = {32, 16};
+    const size_t num_distances = sizeof(distances) / sizeof(distances[0]);
+
+    for (size_t d = 0; d < num_distances; d++) {
+        size_t min_distance = distances[d];
+        size_t message_length = block_length - min_distance;
+
+        correct_reed_solomon *rs = correct_reed_solomon_create(
+            correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+        rs_testbench *testbench = rs_testbench_create(block_length, min_distance);
+
+        // padded run first, then the full-length run
+        const size_t pads[] = {message_length / 2, 0};
+        for (size_t p = 0; p < sizeof(pads) / sizeof(pads[0]); p++) {
+            size_t pad_length = pads[p];
+            size_t payload_length = message_length - pad_length;
+
+            void *fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1,
+                                        min_distance, pad_length);
+
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      0, 0, 20000);
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      min_distance / 2, 0, 20000);
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      0, min_distance, 20000);
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      min_distance / 4, min_distance / 2, 20000);
+
+            free_rs_char(fec_rs);
+        }
+
+        rs_testbench_destroy(testbench);
+        correct_reed_solomon_destroy(rs);
+    }
+
+    printf("test passed\n");
+    return 0;
+}
--- a/core/libcorrect/tests/reed-solomon-shim-interop.c
+++ b/core/libcorrect/tests/reed-solomon-shim-interop.c
@@ -0,0 +1,138 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "rs_tester.h"
+#include "rs_tester_fec_shim.h"
+
+// Announces the configuration about to be tested. No newline is written, so
+// the PASSED/FAILED verdict printed later lands on the same line.
+void print_test_type(size_t block_length, size_t message_length,
+                     size_t num_errors, size_t num_erasures) {
+    printf("testing reed solomon block length=%zu, message length=%zu, "
+           "errors=%zu, erasures=%zu...",
+           block_length, message_length, num_errors, num_erasures);
+}
+
+// Reports a failed configuration and terminates the program with status 1.
+void fail_test() {
+    fputs("FAILED\n", stdout);
+    exit(1);
+}
+
+// Reports a passed configuration.
+void pass_test() {
+    fputs("PASSED\n", stdout);
+}
+
+// Calls test_rs_errors() num_iterations times with the encoder/decoder pairing
+// stored in *test; the first run whose output does not match ends the program
+// through fail_test().
+static void run_direction(rs_test *test, rs_testbench *testbench,
+                          size_t test_msg_length, size_t num_errors,
+                          size_t num_erasures, size_t num_iterations) {
+    size_t remaining = num_iterations;
+    while (remaining > 0) {
+        rs_test_run outcome = test_rs_errors(test, testbench, test_msg_length,
+                                             num_errors, num_erasures);
+        if (!outcome.output_matches) {
+            fail_test();
+        }
+        remaining--;
+    }
+}
+
+// Interop check for one configuration: first encode with libcorrect and decode
+// with the fec shim, then encode with the shim and decode with libcorrect.
+// Each direction is repeated num_iterations times; PASSED is printed only if
+// every run matched.
+void run_tests(correct_reed_solomon *rs, void *fec_rs, rs_testbench *testbench,
+               size_t block_length, size_t test_msg_length, size_t num_errors,
+               size_t num_erasures, size_t num_iterations) {
+    print_test_type(block_length, test_msg_length, num_errors, num_erasures);
+
+    // correct -> fec
+    rs_test correct_to_fec = {
+        .encode = rs_correct_encode,
+        .encoder = rs,
+        .decode = rs_fec_decode,
+        .decoder = fec_rs,
+    };
+    run_direction(&correct_to_fec, testbench, test_msg_length, num_errors,
+                  num_erasures, num_iterations);
+
+    // fec -> correct
+    rs_test fec_to_correct = {
+        .encode = rs_fec_encode,
+        .encoder = fec_rs,
+        .decode = rs_correct_decode,
+        .decoder = rs,
+    };
+    run_direction(&fec_to_correct, testbench, test_msg_length, num_errors,
+                  num_erasures, num_iterations);
+
+    pass_test();
+}
+
+// Entry point: for minimum distances 32 and 16 on a 255-byte block, runs the
+// interop suite twice -- once with the shim codec padded by half the message
+// length and once unpadded. Each pass covers four error/erasure mixes at
+// 20000 iterations. Failures exit from within run_tests(); otherwise returns 0.
+int main() {
+    srand(time(NULL));
+
+    size_t block_length = 255;
+    const size_t distances[] = {32, 16};
+    const size_t num_distances = sizeof(distances) / sizeof(distances[0]);
+
+    for (size_t d = 0; d < num_distances; d++) {
+        size_t min_distance = distances[d];
+        size_t message_length = block_length - min_distance;
+
+        correct_reed_solomon *rs = correct_reed_solomon_create(
+            correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+        rs_testbench *testbench = rs_testbench_create(block_length, min_distance);
+
+        // padded run first, then the full-length run
+        const size_t pads[] = {message_length / 2, 0};
+        for (size_t p = 0; p < sizeof(pads) / sizeof(pads[0]); p++) {
+            size_t pad_length = pads[p];
+            size_t payload_length = message_length - pad_length;
+
+            void *fec_rs = init_rs_char(8, correct_rs_primitive_polynomial_ccsds, 1, 1,
+                                        min_distance, pad_length);
+
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      0, 0, 20000);
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      min_distance / 2, 0, 20000);
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      0, min_distance, 20000);
+            run_tests(rs, fec_rs, testbench, block_length, payload_length,
+                      min_distance / 4, min_distance / 2, 20000);
+
+            free_rs_char(fec_rs);
+        }
+
+        rs_testbench_destroy(testbench);
+        correct_reed_solomon_destroy(rs);
+    }
+
+    printf("test passed\n");
+    return 0;
+}
--- a/core/libcorrect/tests/reed-solomon.c
+++ b/core/libcorrect/tests/reed-solomon.c
@@ -0,0 +1,146 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdbool.h>
+#include <time.h>
+
+#include "rs_tester.h"
+
+// Announces the configuration about to be tested. No newline is written, so
+// the PASSED/FAILED verdict printed later lands on the same line.
+void print_test_type(size_t block_length, size_t message_length,
+                     size_t num_errors, size_t num_erasures) {
+    printf("testing reed solomon block length=%zu, message length=%zu, "
+           "errors=%zu, erasures=%zu...",
+           block_length, message_length, num_errors, num_erasures);
+}
+
+// Reports a failed configuration and terminates the program with status 1.
+void fail_test() {
+    fputs("FAILED\n", stdout);
+    exit(1);
+}
+
+// Reports a passed configuration.
+void pass_test() {
+    fputs("PASSED\n", stdout);
+}
+
+// Round-trips one configuration through libcorrect's own encoder and decoder
+// num_iterations times. The first run whose output does not match ends the
+// program through fail_test(); otherwise PASSED is printed.
+void run_tests(correct_reed_solomon *rs, rs_testbench *testbench,
+               size_t block_length, size_t test_msg_length, size_t num_errors,
+               size_t num_erasures, size_t num_iterations) {
+    rs_test test = {
+        .encode = rs_correct_encode,
+        .encoder = rs,
+        .decode = rs_correct_decode,
+        .decoder = rs,
+    };
+
+    print_test_type(block_length, test_msg_length, num_errors, num_erasures);
+
+    size_t remaining = num_iterations;
+    while (remaining > 0) {
+        rs_test_run outcome = test_rs_errors(&test, testbench, test_msg_length,
+                                             num_errors, num_erasures);
+        if (!outcome.output_matches) {
+            fail_test();
+        }
+        remaining--;
+    }
+
+    pass_test();
+}
+
+// Entry point: for minimum distances 32, 16, 8 and 4 on a 255-byte block,
+// runs four error/erasure mixes, each first with a half-length message and
+// then with a full-length one, at 20000 iterations per run. Failures exit
+// from within run_tests(); otherwise returns 0.
+int main() {
+    srand(time(NULL));
+
+    size_t block_length = 255;
+    const size_t distances[] = {32, 16, 8, 4};
+    const size_t num_distances = sizeof(distances) / sizeof(distances[0]);
+
+    for (size_t d = 0; d < num_distances; d++) {
+        size_t min_distance = distances[d];
+        size_t message_length = block_length - min_distance;
+
+        correct_reed_solomon *rs = correct_reed_solomon_create(
+            correct_rs_primitive_polynomial_ccsds, 1, 1, min_distance);
+        rs_testbench *testbench = rs_testbench_create(block_length, min_distance);
+
+        // {num_errors, num_erasures} pairs, in the order they are exercised
+        const size_t scenarios[][2] = {
+            {0, 0},
+            {min_distance / 2, 0},
+            {0, min_distance},
+            {min_distance / 4, min_distance / 2},
+        };
+        const size_t num_scenarios = sizeof(scenarios) / sizeof(scenarios[0]);
+
+        for (size_t s = 0; s < num_scenarios; s++) {
+            size_t num_errors = scenarios[s][0];
+            size_t num_erasures = scenarios[s][1];
+
+            run_tests(rs, testbench, block_length, message_length / 2,
+                      num_errors, num_erasures, 20000);
+            run_tests(rs, testbench, block_length, message_length,
+                      num_errors, num_erasures, 20000);
+        }
+
+        rs_testbench_destroy(testbench);
+        correct_reed_solomon_destroy(rs);
+    }
+
+    printf("test passed\n");
+    return 0;
+}
--- a/core/libcorrect/tests/rs_tester.c
+++ b/core/libcorrect/tests/rs_tester.c
@@ -0,0 +1,102 @@
+#include "rs_tester.h"
+
+// Shuffles the first `len` entries of `a` in place with a Fisher-Yates pass
+// driven by rand(). Arrays with fewer than two entries are left untouched.
+void shuffle(int *a, size_t len) {
+    // size_t is unsigned, so a `len - k` loop bound would wrap around for
+    // tiny arrays (and end in `rand() % 0`); handle those sizes up front.
+    if (len < 2) {
+        return;
+    }
+    // Stop at len - 1: the final slot could only ever swap with itself.
+    for (size_t i = 0; i < len - 1; i++) {
+        size_t j = rand() % (len - i) + i;
+        int temp = a[i];
+        a[i] = a[j];
+        a[j] = temp;
+    }
+}
+
+// Encode callback for the testbench: forwards to libcorrect's Reed-Solomon
+// encoder, treating the untyped `encoder` handle as a correct_reed_solomon.
+void rs_correct_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                       uint8_t *msg_out) {
+    correct_reed_solomon *rs = (correct_reed_solomon *)encoder;
+    correct_reed_solomon_encode(rs, msg, msg_length, msg_out);
+}
+
+// Decode callback for the testbench: forwards to libcorrect's erasure-aware
+// Reed-Solomon decoder. `pad_length` and `num_roots` belong to the shared
+// decode callback signature and are not used by this implementation.
+void rs_correct_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                       uint8_t *erasure_locations, size_t erasure_length,
+                       uint8_t *msg, size_t pad_length, size_t num_roots) {
+    correct_reed_solomon *rs = (correct_reed_solomon *)decoder;
+    correct_reed_solomon_decode_with_erasures(rs, encoded, encoded_length,
+                                              erasure_locations,
+                                              erasure_length, msg);
+}
+
+// Allocates a testbench with scratch buffers sized for a code of
+// `block_length` total bytes of which `min_distance` are parity.
+//
+// Returns NULL when `min_distance` exceeds `block_length` or when any
+// allocation fails; nothing is leaked in either case. Release a successful
+// result with rs_testbench_destroy().
+rs_testbench *rs_testbench_create(size_t block_length, size_t min_distance) {
+    // block_length - min_distance would wrap around (size_t is unsigned)
+    if (min_distance > block_length) {
+        return NULL;
+    }
+
+    rs_testbench *testbench = calloc(1, sizeof(rs_testbench));
+    if (!testbench) {
+        return NULL;
+    }
+
+    size_t message_length = block_length - min_distance;
+    testbench->message_length = message_length;
+    testbench->block_length = block_length;
+    testbench->min_distance = min_distance;
+
+    testbench->msg = calloc(message_length, sizeof(unsigned char));
+    testbench->encoded = malloc(block_length * sizeof(uint8_t));
+
+    testbench->indices = malloc(block_length * sizeof(int));
+
+    testbench->corrupted_encoded = malloc(block_length * sizeof(uint8_t));
+    testbench->erasure_locations = malloc(min_distance * sizeof(uint8_t));
+    testbench->recvmsg = malloc(sizeof(unsigned char) * message_length);
+
+    // A zero-sized request may legitimately come back as NULL, so a NULL
+    // pointer only counts as a failure when the buffer was meant to hold data.
+    int failed = 0;
+    if (message_length && (!testbench->msg || !testbench->recvmsg)) {
+        failed = 1;
+    }
+    if (block_length && (!testbench->encoded || !testbench->indices ||
+                         !testbench->corrupted_encoded)) {
+        failed = 1;
+    }
+    if (min_distance && !testbench->erasure_locations) {
+        failed = 1;
+    }
+
+    if (failed) {
+        free(testbench->msg);
+        free(testbench->encoded);
+        free(testbench->indices);
+        free(testbench->corrupted_encoded);
+        free(testbench->erasure_locations);
+        free(testbench->recvmsg);
+        free(testbench);
+        return NULL;
+    }
+
+    return testbench;
+}
+
+// Releases a testbench and every buffer it owns. Passing NULL is a no-op,
+// mirroring free().
+void rs_testbench_destroy(rs_testbench *testbench) {
+    if (!testbench) {
+        return;
+    }
+    free(testbench->msg);
+    free(testbench->encoded);
+    free(testbench->indices);
+    free(testbench->corrupted_encoded);
+    free(testbench->erasure_locations);
+    free(testbench->recvmsg);
+    free(testbench);
+}
+
+// Runs one encode -> corrupt -> decode round trip and reports whether the
+// decoded message equals the original.
+//
+//   test         - encode/decode callbacks plus their opaque handles
+//   testbench    - scratch buffers from rs_testbench_create()
+//   msg_length   - payload bytes to use (at most testbench->message_length)
+//   num_errors   - bytes corrupted without telling the decoder where
+//   num_erasures - bytes corrupted whose positions are given to the decoder
+//
+// Returns a run with output_matches == false, without touching the codec,
+// when the request does not fit the testbench.
+rs_test_run test_rs_errors(rs_test *test, rs_testbench *testbench, size_t msg_length,
+                    size_t num_errors, size_t num_erasures) {
+    rs_test_run run;
+    run.output_matches = false;
+
+    if (msg_length > testbench->message_length) {
+        return run;
+    }
+
+    size_t block_length = msg_length + testbench->min_distance;
+    size_t pad_length = testbench->message_length - msg_length;
+
+    // erasure_locations holds min_distance entries and indices holds one
+    // entry per block byte; larger requests would run off those buffers.
+    if (num_erasures > testbench->min_distance ||
+        num_errors > block_length - num_erasures) {
+        return run;
+    }
+
+    for (size_t i = 0; i < msg_length; i++) {
+        testbench->msg[i] = rand() % 256;
+    }
+
+    test->encode(test->encoder, testbench->msg, msg_length, testbench->encoded);
+
+    memcpy(testbench->corrupted_encoded, testbench->encoded, block_length);
+
+    for (size_t i = 0; i < block_length; i++) {
+        testbench->indices[i] = (int)i;
+    }
+
+    // a random ordering of byte positions: the first num_erasures become
+    // erasures, the next num_errors become plain errors
+    shuffle(testbench->indices, block_length);
+
+    for (size_t i = 0; i < num_erasures; i++) {
+        int index = testbench->indices[i];
+        // mask is never zero, so the byte is guaranteed to change
+        uint8_t corruption_mask = (rand() % 255) + 1;
+        testbench->corrupted_encoded[index] ^= corruption_mask;
+        // NOTE(review): positions are stored as uint8_t, which presumes
+        // blocks of at most 256 bytes - confirm against callers
+        testbench->erasure_locations[i] = (uint8_t)index;
+    }
+
+    for (size_t i = 0; i < num_errors; i++) {
+        int index = testbench->indices[i + num_erasures];
+        uint8_t corruption_mask = (rand() % 255) + 1;
+        testbench->corrupted_encoded[index] ^= corruption_mask;
+    }
+
+    test->decode(test->decoder, testbench->corrupted_encoded, block_length,
+                 testbench->erasure_locations, num_erasures,
+                 testbench->recvmsg, pad_length, testbench->min_distance);
+
+    run.output_matches = (bool)(memcmp(testbench->msg, testbench->recvmsg, msg_length) == 0);
+
+    return run;
+}
--- a/core/libcorrect/tests/rs_tester_fec.c
+++ b/core/libcorrect/tests/rs_tester_fec.c
@@ -0,0 +1,30 @@
+#include "rs_tester_fec.h"
+// Encode callback backed by libfec: copies the payload into `msg_out` and
+// passes the address just past it to encode_rs_char() as the parity argument.
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                   uint8_t *msg_out) {
+    // XXX the pad length the encoder was built with has to correspond to
+    // this msg_length
+    uint8_t *parity = msg_out + msg_length;
+    memcpy(msg_out, msg, msg_length);
+    encode_rs_char(encoder, msg_out, parity);
+}
+
+// Decode callback backed by libfec. Erasure positions are widened to int and
+// shifted by `pad_length` before being handed to decode_rs_char(); the first
+// `encoded_length - num_roots` bytes of `encoded` are then copied to `msg`.
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots) {
+    // XXX the pad length the decoder was built with has to correspond to
+    // this encoded_length
+
+    // scratch list kept between calls so it is only grown, never reallocated
+    // for a request that already fits
+    static size_t locations_len = 0;
+    static int *locations = NULL;
+
+    int *erasures = NULL;
+    if (erasure_length) {
+        if (locations_len < erasure_length) {
+            locations = realloc(locations, erasure_length * sizeof(int));
+            locations_len = erasure_length;
+        }
+        for (size_t k = 0; k < erasure_length; k++) {
+            locations[k] = (unsigned int)(erasure_locations[k]) + pad_length;
+        }
+        erasures = locations;
+    }
+
+    // with no erasures this is decode_rs_char(decoder, encoded, NULL, 0)
+    decode_rs_char(decoder, encoded, erasures, erasure_length);
+    memcpy(msg, encoded, encoded_length - num_roots);
+}
--- a/core/libcorrect/tests/rs_tester_fec_shim.c
+++ b/core/libcorrect/tests/rs_tester_fec_shim.c
@@ -0,0 +1,26 @@
+#include "rs_tester_fec_shim.h"
+// Encode callback for the libfec shim: copies the payload into `msg_out` and
+// passes the address just past it to encode_rs_char() as the parity argument.
+void rs_fec_encode(void *encoder, uint8_t *msg, size_t msg_length,
+                   uint8_t *msg_out) {
+    // XXX the pad length the encoder was built with has to correspond to
+    // this msg_length
+    uint8_t *parity = msg_out + msg_length;
+    memcpy(msg_out, msg, msg_length);
+    encode_rs_char(encoder, msg_out, parity);
+}
+
+// Decode callback for the libfec shim. Erasure positions are widened to int
+// and shifted by `pad_length` in a temporary list before being handed to
+// decode_rs_char(); the first `encoded_length - num_roots` bytes of `encoded`
+// are then copied to `msg`.
+void rs_fec_decode(void *decoder, uint8_t *encoded, size_t encoded_length,
+                   uint8_t *erasure_locations, size_t erasure_length,
+                   uint8_t *msg, size_t pad_length, size_t num_roots) {
+    // XXX the pad length the decoder was built with has to correspond to
+    // this encoded_length
+    int *locations = NULL;
+
+    if (erasure_length) {
+        locations = malloc(erasure_length * sizeof(int));
+        for (size_t k = 0; k < erasure_length; k++) {
+            locations[k] = (unsigned int)(erasure_locations[k]) + pad_length;
+        }
+    }
+
+    // with no erasures this is decode_rs_char(decoder, encoded, NULL, 0)
+    decode_rs_char(decoder, encoded, locations, erasure_length);
+    free(locations);  // no-op when no list was built
+
+    memcpy(msg, encoded, encoded_length - num_roots);
+}
--- a/core/libcorrect/tools/CMakeLists.txt
+++ b/core/libcorrect/tools/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Developer tools for searching polynomials. Every executable here is
+# EXCLUDE_FROM_ALL, so none is part of the default build; build them through
+# the `tools` target defined at the bottom.
+
+add_executable(rs_find_primitive_poly EXCLUDE_FROM_ALL find_rs_primitive_poly.c)
+target_link_libraries(rs_find_primitive_poly PRIVATE correct_static)
+list(APPEND all_tools rs_find_primitive_poly)
+
+if(HAVE_LIBFEC)
+    add_executable(conv_find_libfec_poly EXCLUDE_FROM_ALL find_conv_libfec_poly.c)
+    target_link_libraries(conv_find_libfec_poly PRIVATE correct_static fec)
+    list(APPEND all_tools conv_find_libfec_poly)
+endif()
+
+# The optimizing search tools are built against the error simulator objects;
+# only the flavor (SSE or portable) differs between configurations.
+if(HAVE_SSE)
+    set(error_sim_objects $<TARGET_OBJECTS:error_sim_sse>)
+else()
+    set(error_sim_objects $<TARGET_OBJECTS:error_sim>)
+endif()
+
+# Both optimizing tools include <pthread.h>, so link the platform's thread
+# library when CMake can locate one.
+find_package(Threads)
+
+foreach(search_kind optim_poly optim_poly_annealing)
+    add_executable(conv_find_${search_kind} EXCLUDE_FROM_ALL
+        find_conv_${search_kind}.c
+        ${error_sim_objects})
+    target_link_libraries(conv_find_${search_kind} PRIVATE correct_static)
+    if(TARGET Threads::Threads)
+        target_link_libraries(conv_find_${search_kind} PRIVATE Threads::Threads)
+    endif()
+    list(APPEND all_tools conv_find_${search_kind})
+endforeach()
+
+add_custom_target(tools DEPENDS ${all_tools})
--- a/core/libcorrect/tools/find_conv_libfec_poly.c
+++ b/core/libcorrect/tools/find_conv_libfec_poly.c
@@ -0,0 +1,279 @@
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <time.h>
+#include <stddef.h>
+#include <assert.h>
+
+#include <correct.h>
+#include <fec.h>
+
+// this program allows us to find all of the polynomials that come with libfec
+// this way, we can provide compatibility with libfec-encoded streams and vice versa
+// we can do this without directly copy-pasting from libfec's source, thanks
+//   to this finder
+
+// One libfec Viterbi decoder together with the libfec entry points that
+// operate on it, so the search code can drive any decoder through one path.
+typedef struct {
+    // decoder handle; main() fills it from the matching create_viterbi*()
+    void *vit;
+    // length argument handed to update()
+    int update_len;
+    // called as init(vit, 0) before every decode
+    int (*init)(void *, int);
+    // called as update(vit, soft_bits, update_len)
+    int (*update)(void *, unsigned char *, int);
+    // called as chainback(vit, out, number_of_message_bits, 0)
+    int (*chainback)(void *, unsigned char *, unsigned int, unsigned int);
+} libfec_decoder_t;
+
+// Expands packed bytes into one output byte per bit, most significant bit
+// first: a set bit is written as 255 and a clear bit as 0. Exactly `n_bits`
+// entries of `bits` are written.
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits) {
+    for (size_t bit = 0; bit < n_bits; bit++) {
+        // 0x80 selects the top bit of a byte; shift it down to the one wanted
+        uint8_t mask = (uint8_t)(0x80 >> (bit % 8));
+        if (bytes[bit / 8] & mask) {
+            bits[bit] = 255;
+        } else {
+            bits[bit] = 0;
+        }
+    }
+}
+
+// Resizes a polynomial list to hold `cap` entries and returns the (possibly
+// moved) list. This is a thin wrapper over realloc(), so a NULL `polys`
+// allocates a fresh list.
+correct_convolutional_polynomial_t *resize_poly_list(correct_convolutional_polynomial_t *polys, size_t cap) {
+    return realloc(polys, cap * sizeof(*polys));
+}
+
+// Finds every candidate value for one coefficient (`search_coeff`) of the
+// polynomial baked into a libfec decoder.
+//
+// `msg` (of `msg_len` bytes) is encoded with each candidate value; all output
+// streams except the one under test are turned into soft erasures, and the
+// candidate is kept when libfec decodes the result back to `msg`.
+//
+// On return `*polys_dest` is a heap list of the matching values in ascending
+// order and `*polys_len` is its length. The caller owns the list and frees
+// it; when nothing matched the list is NULL and the length is 0.
+void find_poly_coeff(size_t rate, size_t order, uint8_t *msg, size_t msg_len, libfec_decoder_t libfec, correct_convolutional_polynomial_t **polys_dest, size_t *polys_len, size_t search_coeff) {
+    // find a single coefficient of an unknown convolutional polynomial
+    // we are given a payload to encode, and we'll test all possible coefficients
+    //    to see which ones yield correct decodings by libfec, which has some
+    //    unknown polynomial "baked in"
+
+    // temp poly (this will be the one we search with)
+    correct_convolutional_polynomial_t *poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    // what's the largest coefficient value we'll test?
+    correct_convolutional_polynomial_t maxcoeff = (1 << order) - 1;
+
+    // note that we start about half way in
+    // this sum asks that we have the
+    //   a) highest order bit set
+    //   b) lowest order bit set
+    // we're only interested in coefficient values for which this is
+    //   true because if it weren't, the coefficient would actually be
+    //   of a smaller order than its supposed given order
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+
+    // the values of this don't really matter except for the coeff we're searching for
+    // but just to be safe, we set them all
+    for (size_t i = 0; i < rate; i++) {
+        poly[i] = startcoeff;
+    }
+
+    // create a dummy encoder so that we can find how long the resulting encoded value is
+    correct_convolutional *conv_dummy = correct_convolutional_create(rate, order, poly);
+    size_t enclen_bits = correct_convolutional_encode_len(conv_dummy, msg_len);
+    size_t enclen = (enclen_bits % 8) ? (enclen_bits / 8 + 1) : enclen_bits / 8;
+    correct_convolutional_destroy(conv_dummy);
+
+    // compact encoded format (this comes from libcorrect)
+    uint8_t *encoded = malloc(enclen * sizeof(uint8_t));
+    // soft encoded format (this goes to libfec, one byte per bit)
+    uint8_t *encoded_bits = malloc(enclen * 8 * sizeof(uint8_t));
+    // resulting decoded message which we'll compare to our given payload
+    uint8_t *msg_cmp = malloc(msg_len * sizeof(uint8_t));
+
+    // we keep a list of coefficients which yielded correct decodings
+    // there could be 0, 1, or more than 1, and we'll return all of them
+    // we'll dynamically resize this as we go
+    size_t polys_cap = 1;
+    *polys_len = 0;
+    correct_convolutional_polynomial_t *polys = NULL;
+    polys = resize_poly_list(polys, polys_cap);
+
+    // iteration constants -- we go by 2 because we want the lowest order bit to
+    // stay set
+    // the counter is wider than the coefficient type so that stepping past
+    // maxcoeff ends the loop instead of wrapping around
+    for (unsigned int coeff = startcoeff; coeff <= maxcoeff; coeff += 2) {
+        poly[search_coeff] = (correct_convolutional_polynomial_t)coeff;
+        correct_convolutional *conv = correct_convolutional_create(rate, order, poly);
+
+        correct_convolutional_encode(conv, (uint8_t*)msg, msg_len, encoded);
+        // byte2bit() counts in bits, so it must be given the encoded length in
+        // bits; the byte count would leave most of encoded_bits unwritten
+        byte2bit(encoded, encoded_bits, enclen_bits);
+
+        // now erase all the bits we're not searching for
+        // NOTE(review): only the msg_len * 8 payload symbols are erased here;
+        // symbols after the payload keep the dummy coefficients' bits -
+        // confirm that is intended
+        for (size_t sym = 0; sym < msg_len * 8; sym++) {
+            for (size_t j = 0; j < rate; j++) {
+                if (j != search_coeff) {
+                    // 128 is a soft erasure
+                    encoded_bits[sym * rate + j] = 128;
+                }
+            }
+        }
+
+        libfec.init(libfec.vit, 0);
+        libfec.update(libfec.vit, encoded_bits, libfec.update_len);
+        libfec.chainback(libfec.vit, msg_cmp, 8 * msg_len, 0);
+
+        correct_convolutional_destroy(conv);
+
+        if (memcmp(msg_cmp, msg, msg_len) == 0) {
+            // match found
+
+            // resize list to make room
+            if (*polys_len == polys_cap) {
+                polys = resize_poly_list(polys, polys_cap * 2);
+                polys_cap *= 2;
+            }
+            polys[*polys_len] = (correct_convolutional_polynomial_t)coeff;
+            *polys_len = *polys_len + 1;
+        }
+    }
+
+    if (*polys_len == 0) {
+        // shrinking to zero bytes through realloc() is not portable, so hand
+        // back an explicit empty list instead
+        free(polys);
+        polys = NULL;
+    } else {
+        polys = resize_poly_list(polys, *polys_len);
+    }
+    *polys_dest = polys;
+    free(poly);
+    free(msg_cmp);
+    free(encoded);
+    free(encoded_bits);
+}
+
+// Length, in bytes, of the random payloads used by the search; main() also
+// uses it to size the libfec decoders.
+// we choose 2 bytes because we need a payload that's longer than
+// the shift register under test. since that includes an order 15
+// s.r., we need at least 15 bits.
+size_t msg_len = 2;
+
+// Recovers the complete polynomial baked into one libfec decoder, writes its
+// `rate` coefficients to `poly`, and prints them in octal on one line.
+void find_poly(size_t rate, size_t order, libfec_decoder_t libfec, correct_convolutional_polynomial_t *poly) {
+    // find the complete set of coefficients that are "baked in" to
+    //   one particular method of libfec
+    // most of this method is described by find_poly_coeff
+
+    // for each coeff we want to find, we'll generate random 2-byte payloads and give
+    //   them to find_poly_coeff. If find_poly_coeff returns an empty list, we
+    //   try again. If it returns a nonempty list, then we find the intersection of
+    //   all the coefficient values find_poly_coeff has given us so far (we start
+    //   with the complete set). we are finished when only one coeff value remains
+
+    // we perform this process for each coeff e.g. 6 times for a rate 1/6 polynomial
+
+    uint8_t msg[msg_len];
+
+    // this is the list returned to us by find_poly_coeff
+    correct_convolutional_polynomial_t *polys;
+    // the list's length is written here
+    size_t polys_len;
+
+    printf("rate 1/%zu order %zu poly:", rate, order);
+
+    for (size_t search_coeff = 0; search_coeff < rate; search_coeff++) {
+        correct_convolutional_polynomial_t *fit = NULL;
+        size_t fit_len = 0;
+        size_t fit_cap = 0;
+        bool done = false;
+
+        while (!done) {
+            for (size_t i = 0; i < msg_len; i++) {
+                msg[i] = rand() % 256;
+            }
+            find_poly_coeff(rate, order, msg, msg_len, libfec, &polys, &polys_len, search_coeff);
+
+            if (polys_len == 0) {
+                // skip if none fit (this is a special case)
+                // the list still belongs to us even when it is empty, so
+                // release it before trying the next payload
+                free(polys);
+                continue;
+            }
+
+            if (fit_len == 0) {
+                // the very first intersection
+                // we'll just copy the list handed to us
+                fit_cap = polys_len;
+                fit_len = polys_len;
+                fit = resize_poly_list(fit, fit_cap);
+                for (size_t i = 0; i < polys_len; i++) {
+                    fit[i] = polys[i];
+                }
+            } else {
+                // find intersection
+                // (unsigned cursors, matching the unsigned list lengths)
+                size_t polys_iter = 0;
+                size_t fit_iter = 0;
+                size_t new_fit_iter = 0;
+                // the lists generated by find_poly_coeff are sorted
+                // so we just retain the sorted property and walk both
+                while (polys_iter < polys_len && fit_iter < fit_len) {
+                    if (polys[polys_iter] < fit[fit_iter]) {
+                        polys_iter++;
+                    } else if (polys[polys_iter] > fit[fit_iter]) {
+                        fit_iter++;
+                    } else {
+                        fit[new_fit_iter] = fit[fit_iter];
+                        polys_iter++;
+                        fit_iter++;
+                        new_fit_iter++;
+                    }
+                }
+                // if new_fit_iter is 0 here then we don't intersect at all
+                // in this case we have to restart the search for this coeff
+                if (new_fit_iter != 0) {
+                    fit_len = new_fit_iter;
+                } else {
+                    free(fit);
+                    fit = NULL;
+                    fit_cap = 0;
+                    fit_len = 0;
+                }
+            }
+
+            free(polys);
+
+            if (fit_len == 1) {
+                poly[search_coeff] = fit[0];
+                if (order <= 9) {
+                    printf(" %04o", fit[0]);
+                } else {
+                    printf(" %06o", fit[0]);
+                }
+                done = true;
+            }
+        }
+
+        free(fit);
+    }
+    printf("\n");
+}
+
+// Runs the polynomial search against each of the four libfec Viterbi
+// decoders in turn (rate 1/2 order 7, rate 1/2 order 9, rate 1/3 order 9 and
+// rate 1/6 order 15); find_poly() prints one result line per decoder.
+int main() {
+    // sized for the widest polynomial searched below (rate 1/6)
+    correct_convolutional_polynomial_t poly[6];
+    libfec_decoder_t libfec;
+
+    srand(time(NULL));
+
+    // unbuffered so progress appears as each coefficient is found
+    setbuf(stdout, NULL);
+
+    // rate 1/2, order 7
+    libfec = (libfec_decoder_t){
+        .vit = create_viterbi27(8 * msg_len),
+        .update_len = 8 * msg_len + 6,
+        .init = init_viterbi27,
+        .update = update_viterbi27_blk,
+        .chainback = chainback_viterbi27,
+    };
+    find_poly(2, 7, libfec, poly);
+    delete_viterbi27(libfec.vit);
+
+    // rate 1/2, order 9
+    libfec = (libfec_decoder_t){
+        .vit = create_viterbi29(8 * msg_len),
+        .update_len = 8 * msg_len + 8,
+        .init = init_viterbi29,
+        .update = update_viterbi29_blk,
+        .chainback = chainback_viterbi29,
+    };
+    find_poly(2, 9, libfec, poly);
+    delete_viterbi29(libfec.vit);
+
+    // rate 1/3, order 9
+    libfec = (libfec_decoder_t){
+        .vit = create_viterbi39(8 * msg_len),
+        .update_len = 8 * msg_len + 8,
+        .init = init_viterbi39,
+        .update = update_viterbi39_blk,
+        .chainback = chainback_viterbi39,
+    };
+    find_poly(3, 9, libfec, poly);
+    delete_viterbi39(libfec.vit);
+
+    // rate 1/6, order 15
+    libfec = (libfec_decoder_t){
+        .vit = create_viterbi615(8 * msg_len),
+        .update_len = 8 * msg_len + 14,
+        .init = init_viterbi615,
+        .update = update_viterbi615_blk,
+        .chainback = chainback_viterbi615,
+    };
+    find_poly(6, 15, libfec, poly);
+    delete_viterbi615(libfec.vit);
+
+    return 0;
+}
--- a/core/libcorrect/tools/find_conv_optim_poly.c
+++ b/core/libcorrect/tools/find_conv_optim_poly.c
@@ -0,0 +1,330 @@
+#include <stdbool.h>
+#include <float.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <stddef.h>
+#include <limits.h>
+#include <pthread.h>
+
+// Pick the convolutional codec backend at compile time. Both branches define
+// the same names (conv_t plus the conv_* function pointers), so the rest of
+// this file does not depend on which backend is in use.
+#if HAVE_SSE
+// SSE build: everything goes through the *_sse entry points
+#include "correct/util/error-sim-sse.h"
+typedef correct_convolutional_sse conv_t;
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_sse_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_sse_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_sse_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_encode;
+static void(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_decode;
+#else
+// non-SSE build: use the plain entry points
+#include "correct/util/error-sim.h"
+typedef correct_convolutional conv_t;
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_encode;
+static void(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_decode;
+#endif
+
+// A codec instance paired with the polynomial list it was created from
+// (main() builds `conv` with conv_create(rate, order, poly)).
+typedef struct {
+    conv_t *conv;
+    correct_convolutional_polynomial_t *poly;
+} conv_tester_t;
+
+// Score sheet for one candidate polynomial set in the exhaustive search.
+typedef struct {
+    // accumulated test_conv_noise() results, one slot per testbench
+    int *distances;
+    // weighted sum of `distances`, filled in by search_exhaustive_fin()
+    float cost;
+    // the `rate` coefficients of this candidate
+    correct_convolutional_polynomial_t *poly;
+} conv_result_t;
+
+// qsort() comparator ordering conv_result_t entries by ascending cost.
+// Returns a negative, zero or positive value as `a` costs less than, the
+// same as, or more than `b`. Equal costs must compare as 0: a comparator
+// that reports "less" in both directions breaks the ordering contract that
+// qsort() relies on.
+int compare_conv_results(const void *avoid, const void *bvoid) {
+    const conv_result_t *a = (const conv_result_t *)avoid;
+    const conv_result_t *b = (const conv_result_t *)bvoid;
+
+    return (a->cost > b->cost) - (a->cost < b->cost);
+}
+
+// Work description handed to one search_exhaustive_thread() worker.
+typedef struct {
+    // codec parameters passed to conv_create() for every candidate
+    size_t rate;
+    size_t order;
+    // candidate list (shared by all workers) and its length
+    conv_result_t *items;
+    size_t items_len;
+    // the testbench this worker drives
+    conv_testbench *scratch;
+    // message block to push through each candidate, and its length
+    uint8_t *msg;
+    size_t msg_len;
+    // slot in each candidate's `distances` that this worker accumulates into
+    size_t test_offset;
+    // forwarded unchanged to test_conv_noise()
+    double bpsk_voltage;
+} exhaustive_thread_args;
+
+// Thread entry point: runs every candidate polynomial through this worker's
+// testbench and adds each test_conv_noise() result to the candidate's
+// `distances[test_offset]` slot. `vargs` is an exhaustive_thread_args.
+void *search_exhaustive_thread(void *vargs) {
+    exhaustive_thread_args *args = (exhaustive_thread_args *)vargs;
+    conv_testbench *bench = args->scratch;
+
+    for (size_t idx = 0; idx < args->items_len; idx++) {
+        conv_result_t *candidate = &args->items[idx];
+        conv_t *codec = conv_create(args->rate, args->order, candidate->poly);
+
+        // the same codec instance serves as both encoder and decoder
+        bench->encode = conv_encode;
+        bench->encoder = codec;
+        bench->decode = conv_decode;
+        bench->decoder = codec;
+
+        candidate->distances[args->test_offset] += test_conv_noise(bench, args->msg, args->msg_len, args->bpsk_voltage);
+        conv_destroy(codec);
+    }
+    pthread_exit(NULL);
+}
+
+// Pushes one message block through every candidate in `items`, using one
+// worker thread per testbench in `scratches`, and waits for all workers to
+// finish. Results are accumulated into each candidate's `distances`.
+//
+// `weights` is not used here; costs are computed later by
+// search_exhaustive_fin(). If the bookkeeping cannot be allocated or a
+// worker cannot be started, the problem is reported on stderr and the
+// process exits, since a partial run would skew the error statistics.
+void search_exhaustive(size_t rate, size_t order,
+                       size_t n_bytes, uint8_t *msg,
+                       conv_testbench **scratches, size_t num_scratches,
+                       float *weights,
+                       conv_result_t *items,
+                       size_t items_len, double bpsk_voltage) {
+
+    exhaustive_thread_args *args = malloc(num_scratches * sizeof(exhaustive_thread_args));
+    pthread_t *threads = malloc(num_scratches * sizeof(pthread_t));
+
+    if (num_scratches && (!args || !threads)) {
+        fprintf(stderr, "search_exhaustive: out of memory\n");
+        free(args);
+        free(threads);
+        exit(EXIT_FAILURE);
+    }
+
+    // one attribute object serves every worker; it is destroyed once the
+    // workers have been created
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+    size_t started = 0;
+    int create_error = 0;
+    for (; started < num_scratches; started++) {
+        args[started].rate = rate;
+        args[started].order = order;
+        args[started].items = items;
+        args[started].items_len = items_len;
+        args[started].scratch = scratches[started];
+        args[started].msg = msg;
+        args[started].msg_len = n_bytes;
+        args[started].test_offset = started;
+        args[started].bpsk_voltage = bpsk_voltage;
+        create_error = pthread_create(&threads[started], &attr, search_exhaustive_thread, &args[started]);
+        if (create_error) {
+            break;
+        }
+    }
+
+    pthread_attr_destroy(&attr);
+
+    // only join workers that actually exist; joining an id that
+    // pthread_create() never filled in is undefined
+    for (size_t i = 0; i < started; i++) {
+        pthread_join(threads[i], NULL);
+    }
+
+    free(args);
+    free(threads);
+
+    if (create_error) {
+        fprintf(stderr, "search_exhaustive: pthread_create failed (error %d)\n", create_error);
+        exit(EXIT_FAILURE);
+    }
+}
+
+// Zeroes the first `num_scratches` distance slots of every candidate so a
+// new run starts from a clean slate.
+void search_exhaustive_init(conv_result_t *items, size_t items_len,
+                            size_t num_scratches) {
+    for (size_t idx = 0; idx < items_len; idx++) {
+        int *slots = items[idx].distances;
+        size_t slot = 0;
+        while (slot < num_scratches) {
+            slots[slot] = 0;
+            slot++;
+        }
+    }
+}
+
+// Closes a run: sets each candidate's cost to the weighted sum of its
+// distances (weights[j] * distances[j] over `weights_len` slots) and then
+// sorts `items` with compare_conv_results.
+void search_exhaustive_fin(conv_result_t *items, size_t items_len,
+                           float *weights, size_t weights_len) {
+    for (size_t idx = 0; idx < items_len; idx++) {
+        conv_result_t *candidate = &items[idx];
+        candidate->cost = 0;
+        for (size_t w = 0; w < weights_len; w++) {
+            candidate->cost += weights[w] * candidate->distances[w];
+        }
+    }
+
+    qsort(items, items_len, sizeof(conv_result_t), compare_conv_results);
+}
+
+// largest number of message bytes pushed through the testbenches in one
+// block (also the size of the message buffer test() allocates)
+const size_t max_block_len = 16384;
+// per-run message length at which test() stops doubling and treats the
+// error statistics as converged
+const size_t max_msg_len = 50000000;
+
+// Exhaustive search for good polynomials of the given rate and order.
+//
+// Every combination of odd coefficients with the top bit set is scored by
+// pushing random data through each testbench in `scratches` (one noise level
+// per testbench, taken from `eb_n0`). After each run the candidates are
+// sorted by weighted cost and the worse half is dropped, while the message
+// length for the next run doubles (capped at max_msg_len). The search ends
+// once 20 or fewer candidates remain; the survivors are printed with their
+// error rate at each noise level.
+//
+// `start.conv` is only used to size the testbenches. `n_iter` is accepted
+// but not used.
+void test(size_t rate, size_t order,
+          conv_tester_t start, conv_testbench **scratches,
+          size_t num_scratches, float *weights,
+          size_t n_bytes, double *eb_n0,
+          double bpsk_bit_energy, size_t n_iter,
+          double bpsk_voltage) {
+
+    uint8_t *msg = malloc(max_block_len * sizeof(uint8_t));
+
+    correct_convolutional_polynomial_t maxcoeff = (1 << order) - 1;
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+    size_t num_polys = (maxcoeff - startcoeff) / 2 + 1;
+    size_t convs_len = 1;
+    for (size_t i = 0; i < rate; i++) {
+        convs_len *= num_polys;
+    }
+
+    conv_result_t *exhaustive = malloc(convs_len * sizeof(conv_result_t));
+    correct_convolutional_polynomial_t *iter_poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    for (size_t i = 0; i < rate; i++) {
+        iter_poly[i] = startcoeff;
+    }
+
+    // init exhaustive with all polys
+    for (size_t i = 0; i < convs_len; i++) {
+        exhaustive[i].poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+        exhaustive[i].distances = calloc(num_scratches, sizeof(int));
+        exhaustive[i].cost = 0;
+        memcpy(exhaustive[i].poly, iter_poly, rate * sizeof(correct_convolutional_polynomial_t));
+        // this next loop adds 2 with "carry"
+        for (size_t j = 0; j < rate; j++) {
+            if (iter_poly[j] < maxcoeff) {
+                iter_poly[j] += 2;
+                // no more carries to propagate
+                break;
+            } else {
+                iter_poly[j] = startcoeff;
+            }
+        }
+    }
+    free(iter_poly);
+
+    // number of message bytes that produced the distances currently stored in
+    // `exhaustive`; n_bytes itself is advanced for the *next* run at the end
+    // of each iteration, so it cannot be used when reporting
+    size_t last_run_bytes = n_bytes;
+
+    while (convs_len > 20) {
+        size_t bytes_remaining = n_bytes;
+        last_run_bytes = n_bytes;
+
+        // call init(), which sets all the error metrics to 0 for our new run
+        search_exhaustive_init(exhaustive, convs_len, num_scratches);
+
+        while (bytes_remaining) {
+            // in order to keep memory usage constant, we separate the msg into
+            // blocks and send each one through
+            // each time we do this, we have to calculate a new noise for each
+            // testbench
+
+            size_t block_len = (max_block_len < bytes_remaining) ? max_block_len : bytes_remaining;
+            bytes_remaining -= block_len;
+
+            for (size_t j = 0; j < block_len; j++) {
+                msg[j] = rand() % 256;
+            }
+
+            for (size_t i = 0; i < num_scratches; i++) {
+                scratches[i] = resize_conv_testbench(scratches[i], conv_enclen, start.conv, block_len);
+                build_white_noise(scratches[i]->noise, scratches[i]->enclen, eb_n0[i], bpsk_bit_energy);
+            }
+
+            search_exhaustive(rate, order,
+                              block_len, msg, scratches, num_scratches, weights,
+                              exhaustive, convs_len, bpsk_voltage);
+        }
+
+        // call fin(), which calculates a cost metric for all of the distances
+        // added by our msg block iterations and then sorts by this metric
+        search_exhaustive_fin(exhaustive, convs_len, weights, num_scratches);
+
+        // decide parameters for next loop iter
+        // if we've reduced to 20 or fewer items, we're going to just select
+        // those and declare the test done
+        size_t new_convs_len = (convs_len / 2) < 20 ? 20 : convs_len / 2;
+
+        // normally we'll double the message length each time we halve
+        // the number of entries so that each iter takes roughly the
+        // same time but has twice the resolution of the previous run.
+        //
+        // however, if we've reached max_msg_len, then we assume that
+        // the error stats collected are likely converged to whatever
+        // final value they'll take, and adding more length will not
+        // help us get better metrics. if we're at that point, then
+        // we just select the top 20 items and declare them winners
+        if (n_bytes >= max_msg_len) {
+            // converged case
+            new_convs_len = 20;
+        } else {
+            // increase our error metric resolution next run
+            n_bytes *= 2;
+            n_bytes = (n_bytes < max_msg_len) ? n_bytes : max_msg_len;
+        }
+        for (size_t i = new_convs_len; i < convs_len; i++) {
+            // these entries lost, free their memory here
+            free(exhaustive[i].poly);
+            free(exhaustive[i].distances);
+        }
+        convs_len = new_convs_len;
+        printf("exhaustive run: %zu items remain\n", convs_len);
+    }
+
+    for (size_t i = 0; i < convs_len; i++) {
+        for (size_t j = 0; j < rate; j++) {
+            printf(" %06o", exhaustive[i].poly[j]);
+        }
+        printf(":");
+        for (size_t j = 0; j < num_scratches; j++) {
+            // normalize by the size of the run that was actually measured
+            printf(" %.2e@%.1fdB", exhaustive[i].distances[j]/((float)last_run_bytes * 8), eb_n0[j]);
+        }
+        printf("\n");
+    }
+
+    for (size_t i = 0; i < convs_len; i++) {
+        free(exhaustive[i].poly);
+        free(exhaustive[i].distances);
+    }
+    free(exhaustive);
+    free(msg);
+}
+
+// Entry point: conv_find_optim_poly <rate> <order> <n_bytes> <n_iter>
+//
+// Parses the four numeric arguments, picks the noise levels to test at from
+// the order, and runs the exhaustive search. Returns 1 on bad arguments.
+int main(int argc, char **argv) {
+    if (argc < 5) {
+        fprintf(stderr, "usage: %s <rate> <order> <n_bytes> <n_iter>\n",
+                argc > 0 ? argv[0] : "conv_find_optim_poly");
+        return 1;
+    }
+
+    srand(time(NULL));
+
+    size_t rate, order, n_bytes, n_iter;
+
+    if (sscanf(argv[1], "%zu", &rate) != 1 ||
+        sscanf(argv[2], "%zu", &order) != 1 ||
+        sscanf(argv[3], "%zu", &n_bytes) != 1 ||
+        sscanf(argv[4], "%zu", &n_iter) != 1) {
+        fprintf(stderr, "error: rate, order, n_bytes and n_iter must be non-negative integers\n");
+        return 1;
+    }
+
+    // coefficients are 16 bits wide and the start coefficient is built from
+    // 1 << (order - 1), so orders outside 1..16 cannot be represented
+    if (rate == 0 || order == 0 || order > 16) {
+        fprintf(stderr, "error: rate must be at least 1 and order must be in 1..16\n");
+        return 1;
+    }
+
+    double bpsk_voltage = 1.0/sqrt(2.0);
+    double bpsk_sym_energy = 2*pow(bpsk_voltage, 2.0);
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // rate bits transmitted for every input bit
+
+    correct_convolutional_polynomial_t maxcoeff = (1 << order) - 1;
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+
+    conv_tester_t start;
+
+    start.poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    for (size_t i = 0; i < rate; i++) {
+        start.poly[i] = ((maxcoeff - startcoeff) / 2) + startcoeff + 1;
+    }
+
+    start.conv = conv_create(rate, order, start.poly);
+
+    size_t num_scratches = 4;
+    float *weights;
+    conv_testbench **scratches = malloc(num_scratches * sizeof(conv_testbench *));
+    double *eb_n0;
+
+    for (size_t i = 0; i < num_scratches; i++) {
+        scratches[i] = resize_conv_testbench(NULL, conv_enclen, start.conv, max_block_len);
+    }
+
+    // These tables live at function scope on purpose: a compound literal
+    // written inside the switch body stops existing when that block ends,
+    // which is before test() gets to read it.
+    double eb_n0_order_6[] = {6.0, 5.5, 5.0, 4.5};
+    double eb_n0_order_7[] = {5.5, 5.0, 4.5, 4.0};
+    double eb_n0_order_8_9[] = {5.0, 4.5, 4.0, 3.5};
+    double eb_n0_other[] = {4.5, 4.0, 3.5, 3.0};
+    float cost_weights[] = {8000, 400, 20, 1};
+
+    weights = cost_weights;
+    switch (order) {
+        case 6:
+            eb_n0 = eb_n0_order_6;
+            break;
+        case 7:
+            eb_n0 = eb_n0_order_7;
+            break;
+        case 8:
+        case 9:
+            eb_n0 = eb_n0_order_8_9;
+            break;
+        default:
+            eb_n0 = eb_n0_other;
+    }
+
+    test(rate, order, start, scratches, num_scratches, weights, n_bytes, eb_n0, bpsk_bit_energy, n_iter, bpsk_voltage);
+
+    free(start.poly);
+    conv_destroy(start.conv);
+    for (size_t i = 0; i < num_scratches; i++) {
+        free_scratch(scratches[i]);
+    }
+    free(scratches);
+
+    return 0;
+}
--- a/core/libcorrect/tools/find_conv_optim_poly_annealing.c
+++ b/core/libcorrect/tools/find_conv_optim_poly_annealing.c
@@ -0,0 +1,350 @@
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+// Backend selection: bind the generic conv_* entry points used by this tool
+// to either the SSE implementation or the portable one.
+// conv_decode is declared as returning ssize_t because the functions assigned
+// to it (conv_correct_sse_decode / conv_correct_decode) return ssize_t and the
+// testbench reads that return value as the decoded length.
+#if HAVE_SSE
+#include "correct/util/error-sim-sse.h"
+typedef correct_convolutional_sse conv_t;
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_sse_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_sse_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_sse_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_encode;
+static ssize_t(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_sse_decode;
+#else
+#include "correct/util/error-sim.h"
+typedef correct_convolutional conv_t;
+static conv_t*(*conv_create)(size_t, size_t, const uint16_t *) = correct_convolutional_create;
+static void(*conv_destroy)(conv_t *) = correct_convolutional_destroy;
+static size_t(*conv_enclen)(void *, size_t) = conv_correct_enclen;
+static void(*conv_encode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_encode;
+static ssize_t(*conv_decode)(void *, uint8_t *, size_t, uint8_t *) = conv_correct_decode;
+#endif
+
+typedef struct {  // a convolutional code handle paired with the polynomials it was created from
+    conv_t *conv;  // handle returned by conv_create
+    correct_convolutional_polynomial_t *poly;  // array of `rate` polynomials; main allocates and frees it
+} conv_tester_t;
+
+// Shuffle the len ints of a in place with a Fisher-Yates pass driven by rand().
+// Arrays of length 0 or 1 are left untouched.
+void shuffle(int *a, size_t len) {
+    // The bound is written as i + 1 < len so that it cannot wrap around for
+    // len < 2 (len is unsigned) and so that the final pair of elements also
+    // gets a chance to be swapped.
+    for (size_t i = 0; i + 1 < len; i++) {
+        // pick a partner uniformly from the not-yet-fixed tail [i, len)
+        size_t j = rand() % (len - i) + i;
+        int temp = a[i];
+        a[i] = a[j];
+        a[j] = temp;
+    }
+}
+
+// Draw a geometrically distributed count: starting at 1, keep incrementing
+// while a uniform draw in [0, 1] exceeds p, stopping as soon as a draw does
+// not exceed p or the count reaches max.
+int rand_geo(float p, int max) {
+    int count;
+    for (count = 1; count < max; count++) {
+        if (!(rand() / (float)RAND_MAX > p)) {
+            break;
+        }
+    }
+    return count;
+}
+
+// Build a random neighbor of start: copy the rate polynomials into neighbor
+// and flip a geometrically distributed number of distinct interior bits.
+void next_neighbor(correct_convolutional_polynomial_t *start,
+                   correct_convolutional_polynomial_t *neighbor, size_t rate, size_t order) {
+    // we avoid the edge bits to prevent creating a degenerate poly, so each
+    // polynomial contributes only (order - 2) candidate bits
+    const size_t inner_bits = order - 2;
+    const size_t n_candidates = rate * inner_bits;
+
+    // a shuffled list of candidate ids guarantees no bit is flipped twice
+    int coeffs[n_candidates];
+    for (size_t k = 0; k < n_candidates; k++) {
+        coeffs[k] = (int)k;
+    }
+    shuffle(coeffs, n_candidates);
+
+    memcpy(neighbor, start, rate * sizeof(correct_convolutional_polynomial_t));
+
+    size_t nflips = rand_geo(0.4, n_candidates);
+    for (size_t k = 0; k < nflips; k++) {
+        // candidate id -> (polynomial, bit position); +1 skips the lowest bit
+        size_t which_poly = coeffs[k] / inner_bits;
+        size_t which_bit = coeffs[k] % inner_bits + 1;
+        neighbor[which_poly] ^= 1 << which_bit;
+    }
+}
+
+// Annealing acceptance rule: a cheaper candidate (cost_b < cost_a) is always
+// accepted; otherwise it is accepted when exp of the relative cost change
+// divided by temperature exceeds a uniform random draw in [0, 1].
+bool accept(float cost_a, float cost_b, double temperature) {
+    if (!(cost_b < cost_a)) {
+        float p = (float)(rand()) / (float)(RAND_MAX);
+        return exp((cost_a - cost_b) / (cost_a * temperature)) > p;
+    }
+
+    return true;
+}
+
+typedef struct {  // per-thread job description handed to find_cost_thread
+    size_t rate;  // first argument passed to conv_create
+    size_t order;  // second argument passed to conv_create
+    correct_convolutional_polynomial_t *poly;  // polynomials passed to conv_create
+    unsigned int distance;  // output: sum of test_conv_noise results over all blocks
+    conv_testbench *scratch;  // testbench buffers, resized and reused for every block
+    size_t msg_len;  // total number of random message bytes to simulate
+    double eb_n0;  // forwarded to build_white_noise
+    double bpsk_voltage;  // forwarded to test_conv_noise
+    double bpsk_bit_energy;  // forwarded to build_white_noise
+} thread_args;
+
+const size_t max_block_len = 16384;  // largest message block, in bytes, simulated in one pass
+
+// Thread entry point: simulate args->msg_len random bytes through a freshly
+// created code built from args->poly and store the accumulated result of
+// test_conv_noise in args->distance. vargs must point to a thread_args.
+// NOTE(review): rand() is called from several threads at once here; it is
+// not guaranteed to be thread-safe -- confirm this is acceptable.
+void *find_cost_thread(void *vargs) {
+    thread_args *job = (thread_args *)vargs;
+    uint8_t *msg = malloc(max_block_len);
+    conv_t *conv = conv_create(job->rate, job->order, job->poly);
+    conv_testbench *scratch = job->scratch;
+
+    job->distance = 0;
+
+    // in order to keep memory usage constant, the message is cut into blocks
+    // of at most max_block_len bytes and each block gets freshly built noise
+    for (size_t bytes_remaining = job->msg_len; bytes_remaining;) {
+        size_t block_len = bytes_remaining;
+        if (max_block_len < bytes_remaining) {
+            block_len = max_block_len;
+        }
+        bytes_remaining -= block_len;
+
+        for (unsigned int j = 0; j < block_len; j++) {
+            msg[j] = rand() % 256;
+        }
+
+        scratch = resize_conv_testbench(scratch, conv_enclen, conv, block_len);
+        scratch->encode = conv_encode;
+        scratch->encoder = conv;
+        scratch->decode = conv_decode;
+        scratch->decoder = conv;
+
+        build_white_noise(scratch->noise, scratch->enclen, job->eb_n0, job->bpsk_bit_energy);
+
+        job->distance += test_conv_noise(scratch, msg, block_len, job->bpsk_voltage);
+    }
+
+    conv_destroy(conv);
+    free(msg);
+    pthread_exit(NULL);
+}
+
+// Evaluate one set of polynomials: run num_scratches simulations in parallel
+// (one thread per scratch / eb_n0 entry), print the per-level error rates and
+// return the weighted sum of the error counts.
+//
+// rate, order, poly  describe the code under test
+// msg_len            number of message bytes simulated per thread
+// scratches          num_scratches testbenches, one per thread
+// weights, eb_n0     num_scratches entries each
+float find_cost(size_t rate, size_t order, correct_convolutional_polynomial_t *poly, size_t msg_len,
+                conv_testbench **scratches, size_t num_scratches, float *weights, double *eb_n0,
+                double bpsk_voltage, double bpsk_bit_energy) {
+    thread_args *args = malloc(num_scratches * sizeof(thread_args));
+    pthread_t *threads = malloc(num_scratches * sizeof(pthread_t));
+
+    // One attribute object is shared by all threads and destroyed once they
+    // have been created; initialising it per iteration without a matching
+    // pthread_attr_destroy leaked whatever the implementation allocates.
+    pthread_attr_t attr;
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+
+    for (size_t i = 0; i < num_scratches; i++) {
+        args[i].rate = rate;
+        args[i].order = order;
+        args[i].poly = poly;
+        args[i].scratch = scratches[i];
+        args[i].msg_len = msg_len;
+        args[i].eb_n0 = eb_n0[i];
+        args[i].bpsk_voltage = bpsk_voltage;
+        args[i].bpsk_bit_energy = bpsk_bit_energy;
+        pthread_create(&threads[i], &attr, find_cost_thread, &args[i]);
+    }
+
+    pthread_attr_destroy(&attr);
+
+    for (size_t i = 0; i < num_scratches; i++) {
+        pthread_join(threads[i], NULL);
+    }
+
+    float cost = 0;
+    printf("poly:");
+    for (size_t i = 0; i < rate; i++) {
+        printf(" %06o", poly[i]);
+    }
+    printf(" error:");
+    for (size_t i = 0; i < num_scratches; i++) {
+        cost += weights[i] * args[i].distance;
+        // distance divided by the number of message bits
+        printf(" %.2e@%.1fdB", (args[i].distance / (float)(msg_len * 8)), eb_n0[i]);
+    }
+    printf("\n");
+
+    free(args);
+    free(threads);
+
+    return cost;
+}
+
+// Set by sig_handler; polled by the annealing loop after each step.
+// volatile sig_atomic_t is the type the C standard guarantees can be written
+// from a signal handler and read from the main program.
+static volatile sig_atomic_t terminated = 0;
+
+// Handler for SIGINT / SIGTERM / SIGHUP: request a graceful stop and announce
+// it once. Only async-signal-safe calls are made, so the notice is emitted
+// with write() instead of printf().
+void sig_handler(int sig) {
+    if (sig == SIGINT || sig == SIGTERM || sig == SIGHUP) {
+        if (!terminated) {
+            terminated = 1;
+            static const char notice[] = "terminating after current poly\n";
+            ssize_t ignored = write(STDOUT_FILENO, notice, sizeof(notice) - 1);
+            (void)ignored;  // nothing useful can be done on failure inside a handler
+        }
+    }
+}
+
+// Run one simulated-annealing search of at most n_steps steps starting from
+// start->poly. On return start->poly holds the cheapest polynomial set seen.
+//
+// start_temperature  initial temperature handed to accept()
+// cooling_factor     temperature is multiplied by this after every step
+// The remaining arguments are forwarded unchanged to find_cost.
+// The loop ends early once the `terminated` flag has been set.
+void search_simulated_annealing(size_t rate, size_t order, size_t n_steps, conv_tester_t *start,
+                                size_t n_bytes, conv_testbench **scratches, size_t num_scratches,
+                                float *weights, double start_temperature, double cooling_factor,
+                                double *eb_n0, double bpsk_voltage, double bpsk_bit_energy) {
+    // perform simulated annealing to find the optimal polynomial
+
+    float cost = find_cost(rate, order, start->poly, n_bytes, scratches, num_scratches, weights,
+                           eb_n0, bpsk_voltage, bpsk_bit_energy);
+
+    correct_convolutional_polynomial_t *neighbor_poly =
+        malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    correct_convolutional_polynomial_t *state =
+        malloc(rate * sizeof(correct_convolutional_polynomial_t));
+    correct_convolutional_polynomial_t *best =
+        malloc(rate * sizeof(correct_convolutional_polynomial_t));
+
+    float best_cost = cost;
+
+    memcpy(state, start->poly, rate * sizeof(correct_convolutional_polynomial_t));
+    memcpy(best, start->poly, rate * sizeof(correct_convolutional_polynomial_t));
+
+    double temperature = start_temperature;
+
+    for (size_t i = 0; i < n_steps; i++) {
+        next_neighbor(state, neighbor_poly, rate, order);
+        float neighbor_cost =
+            find_cost(rate, order, neighbor_poly, n_bytes, scratches, num_scratches, weights, eb_n0,
+                      bpsk_voltage, bpsk_bit_energy);
+        if (accept(cost, neighbor_cost, temperature)) {
+            // we're moving to our neighbor's house
+            memcpy(state, neighbor_poly, rate * sizeof(correct_convolutional_polynomial_t));
+            cost = neighbor_cost;
+        } else {
+            // actually where we live now is nice
+        }
+
+        // remember the cheapest state ever visited, not just the current one
+        if (cost < best_cost) {
+            best_cost = cost;
+            memcpy(best, state, rate * sizeof(correct_convolutional_polynomial_t));
+        }
+
+        temperature *= cooling_factor;
+
+        if (terminated) {
+            break;
+        }
+    }
+
+    printf("last state:");
+    for (size_t i = 0; i < rate; i++) {
+        printf(" %06o", state[i]);
+    }
+    printf("\n");
+
+    printf("best state:");
+    for (size_t i = 0; i < rate; i++) {
+        printf(" %06o", best[i]);
+    }
+    // terminate the line so the next search's "poly:" output starts on its own line
+    printf("\n");
+
+    memcpy(start->poly, best, rate * sizeof(correct_convolutional_polynomial_t));
+
+    free(state);
+    free(best);
+    free(neighbor_poly);
+}
+
+// Run n_iter annealing searches back to back, each one continuing from the
+// best polynomial of the previous one. The first search uses its own schedule
+// (temperature 0.5, cooling 0.985, 500 steps); every later one uses
+// temperature 250, cooling 0.95 and 100 steps.
+void test_sa(size_t rate, size_t order, conv_tester_t start, conv_testbench **scratches,
+             size_t num_scratches, float *weights, size_t n_bytes, double *eb_n0,
+             double bpsk_bit_energy, size_t n_iter, double bpsk_voltage) {
+    for (size_t round = 0; round < n_iter; round++) {
+        double temperature;
+        double cooling_factor;
+        size_t n_steps;
+
+        if (round == 0) {
+            temperature = 0.5;
+            cooling_factor = 0.985;
+            n_steps = 500;
+        } else {
+            temperature = 250;
+            cooling_factor = 0.95;
+            n_steps = 100;
+        }
+
+        search_simulated_annealing(rate, order, n_steps, &start, n_bytes, scratches, num_scratches,
+                                   weights, temperature, cooling_factor, eb_n0, bpsk_voltage,
+                                   bpsk_bit_energy);
+    }
+}
+
+// Usage: <program> <rate> <order> <n_bytes> <n_iter>
+//
+// Picks a random starting polynomial set for a code with the given rate and
+// order, then runs n_iter annealing searches that each simulate n_bytes
+// message bytes per noise level. Returns 0 on success, 1 on bad arguments or
+// allocation failure.
+int main(int argc, char **argv) {
+    srand(time(NULL));
+
+    signal(SIGINT, sig_handler);
+    signal(SIGTERM, sig_handler);
+    signal(SIGHUP, sig_handler);
+
+    // all four positional arguments are required; argv[1..4] were previously
+    // read without checking that they exist
+    if (argc < 5) {
+        fprintf(stderr, "usage: %s <rate> <order> <n_bytes> <n_iter>\n",
+                (argc > 0) ? argv[0] : "find_conv_optim_poly_annealing");
+        return 1;
+    }
+
+    size_t rate, order, n_bytes, n_iter;
+
+    if (sscanf(argv[1], "%zu", &rate) != 1 || sscanf(argv[2], "%zu", &order) != 1 ||
+        sscanf(argv[3], "%zu", &n_bytes) != 1 || sscanf(argv[4], "%zu", &n_iter) != 1) {
+        fprintf(stderr, "rate, order, n_bytes and n_iter must be unsigned integers\n");
+        return 1;
+    }
+
+    // next_neighbor flips only the (order - 2) interior bits of each
+    // polynomial, so it needs at least one polynomial and one interior bit
+    if (rate == 0 || order < 3) {
+        fprintf(stderr, "rate must be at least 1 and order at least 3\n");
+        return 1;
+    }
+
+    double bpsk_voltage = 1.0 / sqrt(2.0);
+    double bpsk_sym_energy = 2 * pow(bpsk_voltage, 2.0);
+    double bpsk_bit_energy = bpsk_sym_energy * rate;  // rate bits transmitted for every input bit
+
+    // every candidate has its top and bottom bit set
+    correct_convolutional_polynomial_t startcoeff = (1 << (order - 1)) + 1;
+
+    // One entry per parallel simulation. These tables live at function scope:
+    // pointing eb_n0 / weights at compound literals created inside the switch
+    // body left them dangling once the switch block ended.
+    enum { n_levels = 4 };
+    size_t num_scratches = n_levels;
+    float weights[n_levels] = {8000, 400, 20, 1};
+    double eb_n0[n_levels];
+
+    conv_tester_t start;
+    start.poly = malloc(rate * sizeof(correct_convolutional_polynomial_t));
+    conv_testbench **scratches = malloc(num_scratches * sizeof(conv_testbench *));
+    if (!start.poly || !scratches) {
+        fprintf(stderr, "out of memory\n");
+        free(start.poly);
+        free(scratches);
+        return 1;
+    }
+
+    for (size_t i = 0; i < rate; i++) {
+        // random interior bits, shifted past bit 0, plus the fixed edge bits
+        start.poly[i] = ((rand() % (1 << (order - 2))) << 1) + startcoeff;
+    }
+
+    start.conv = conv_create(rate, order, start.poly);
+
+    for (size_t i = 0; i < num_scratches; i++) {
+        scratches[i] = resize_conv_testbench(NULL, conv_enclen, start.conv, max_block_len);
+    }
+
+    // highest Eb/N0 (dB) simulated for this order; the other levels step down by 0.5 dB
+    double eb_n0_top;
+    switch (order) {
+        case 6:
+            eb_n0_top = 6.0;
+            break;
+        case 7:
+        case 8:
+            eb_n0_top = 5.5;
+            break;
+        case 9:
+        case 10:
+            eb_n0_top = 5.0;
+            break;
+        case 11:
+        case 12:
+        case 13:
+            eb_n0_top = 4.5;
+            break;
+        default:
+            eb_n0_top = 3.5;
+    }
+    for (size_t i = 0; i < num_scratches; i++) {
+        eb_n0[i] = eb_n0_top - 0.5 * (double)i;
+    }
+
+    test_sa(rate, order, start, scratches, num_scratches, weights, n_bytes, eb_n0, bpsk_bit_energy,
+            n_iter, bpsk_voltage);
+
+    free(start.poly);
+    conv_destroy(start.conv);
+    for (size_t i = 0; i < num_scratches; i++) {
+        free_scratch(scratches[i]);
+    }
+    free(scratches);
+
+    return 0;
+}
--- a/core/libcorrect/tools/find_rs_primitive_poly.c
+++ b/core/libcorrect/tools/find_rs_primitive_poly.c
@@ -0,0 +1,51 @@
+#include "correct/reed-solomon.h"
+
+size_t block_size = 255;  // largest element value; the log table holds block_size + 1 entries
+int power_max = 8;  // exponent printed for the top bit of a candidate polynomial
+
+// visit all of the elements from the poly
+//
+// Starting from 1, the element is doubled block_size times, XORing in poly
+// whenever the value exceeds block_size. Returns false as soon as a step
+// lands on an element whose log entry is already nonzero, true otherwise.
+// log is scratch space with block_size + 1 entries and is overwritten.
+bool trypoly(field_operation_t poly, field_logarithm_t *log) {
+    // clear the whole table: the byte count must scale with the element size
+    memset(log, 0, (block_size + 1) * sizeof(*log));
+    field_operation_t element = 1;
+    log[0] = (field_logarithm_t)0;
+    for (field_operation_t i = 1; i < block_size + 1; i++) {
+        element = element * 2;
+        element = (element > block_size) ? (element ^ poly) : element;
+        if (log[element] != 0) {
+            // this element was already produced by an earlier step
+            return false;
+        }
+        log[element] = (field_logarithm_t)i;
+    }
+    return true;
+}
+
+// Try every candidate polynomial with the top bit set (block_size + 1 up to,
+// but not including, twice that) and print each one trypoly accepts, both in
+// hex and as a sum of powers of x.
+int main() {
+    field_logarithm_t *log = malloc((block_size + 1) * sizeof(field_logarithm_t));
+
+    for (field_operation_t i = (block_size + 1); i < (block_size + 1) << 1; i++) {
+        if (!trypoly(i, log)) {
+            continue;
+        }
+
+        printf("0x%x valid: ", i);
+
+        // walk the bits from the top down, shifting each one into the top position
+        field_operation_t poly = i;
+        for (int power = power_max; poly; power--) {
+            if (poly & (block_size + 1)) {
+                if (power > 1) {
+                    printf("x^%d", power);
+                } else if (power) {
+                    printf("x");
+                } else {
+                    printf("1");
+                }
+                // more terms follow when any lower bit is still set
+                if (poly & block_size) {
+                    printf(" + ");
+                }
+            }
+            poly <<= 1;
+            poly &= (block_size << 1) + 1;
+        }
+        printf("\n");
+    }
+
+    free(log);
+    return 0;
+}
--- a/core/libcorrect/util/CMakeLists.txt
+++ b/core/libcorrect/util/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_library(error_sim OBJECT error-sim.c)  # object library containing error-sim.c only
+
+add_library(error_sim_shim OBJECT error-sim.c error-sim-shim.c)  # error-sim.c plus the wrappers in error-sim-shim.c
+
+if(HAVE_LIBFEC)  # target is only defined when HAVE_LIBFEC is truthy
+    add_library(error_sim_fec OBJECT error-sim.c error-sim-fec.c)  # error-sim.c plus the wrappers in error-sim-fec.c
+endif()
+
+if(HAVE_SSE)  # target is only defined when HAVE_SSE is truthy
+    add_library(error_sim_sse OBJECT error-sim.c error-sim-sse.c)  # error-sim.c plus the wrappers in error-sim-sse.c
+endif()
--- a/core/libcorrect/util/error-sim-fec.c
+++ b/core/libcorrect/util/error-sim-fec.c
@@ -0,0 +1,29 @@
+#include "correct/util/error-sim-fec.h"
+
+// Decode soft with the viterbi27 routines: reset the decoder, feed it
+// soft_len / 2 - 2 steps, then chain back soft_len / 2 - 8 bits into msg.
+void conv_fec27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 2;
+    const size_t n_decoded_bits = n_steps - 8;
+
+    init_viterbi27(conv_v, 0);
+    update_viterbi27_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi27(conv_v, msg, n_decoded_bits, 0);
+}
+
+// Decode soft with the viterbi29 routines: reset the decoder, feed it
+// soft_len / 2 - 2 steps, then chain back soft_len / 2 - 10 bits into msg.
+void conv_fec29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 2;
+    const size_t n_decoded_bits = n_steps - 10;
+
+    init_viterbi29(conv_v, 0);
+    update_viterbi29_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi29(conv_v, msg, n_decoded_bits, 0);
+}
+
+// Decode soft with the viterbi39 routines: reset the decoder, feed it
+// soft_len / 3 - 2 steps, then chain back soft_len / 3 - 10 bits into msg.
+void conv_fec39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 3;
+    const size_t n_decoded_bits = n_steps - 10;
+
+    init_viterbi39(conv_v, 0);
+    update_viterbi39_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi39(conv_v, msg, n_decoded_bits, 0);
+}
+
+// Decode soft with the viterbi615 routines: reset the decoder, feed it
+// soft_len / 6 - 2 steps, then chain back soft_len / 6 - 16 bits into msg.
+void conv_fec615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 6;
+    const size_t n_decoded_bits = n_steps - 16;
+
+    init_viterbi615(conv_v, 0);
+    update_viterbi615_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi615(conv_v, msg, n_decoded_bits, 0);
+}
--- a/core/libcorrect/util/error-sim-shim.c
+++ b/core/libcorrect/util/error-sim-shim.c
@@ -0,0 +1,33 @@
+#include "correct/util/error-sim-shim.h"
+
+// Decode soft with the viterbi27 routines (soft_len / 2 - 2 steps, then
+// soft_len / 2 - 8 bits chained back into msg) and return the number of
+// bytes needed to hold the decoded bits, rounded up.
+ssize_t conv_shim27_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 2;
+    const size_t n_decoded_bits = n_steps - 8;
+
+    init_viterbi27(conv_v, 0);
+    update_viterbi27_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi27(conv_v, msg, n_decoded_bits, 0);
+
+    // whole bytes, plus one more when a partial byte remains
+    size_t n_decoded_bytes = n_decoded_bits / 8;
+    if (n_decoded_bits % 8) {
+        n_decoded_bytes++;
+    }
+    return n_decoded_bytes;
+}
+
+// Decode soft with the viterbi29 routines (soft_len / 2 - 2 steps, then
+// soft_len / 2 - 10 bits chained back into msg) and return the number of
+// bytes needed to hold the decoded bits, rounded up.
+ssize_t conv_shim29_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 2;
+    const size_t n_decoded_bits = n_steps - 10;
+
+    init_viterbi29(conv_v, 0);
+    update_viterbi29_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi29(conv_v, msg, n_decoded_bits, 0);
+
+    // whole bytes, plus one more when a partial byte remains
+    size_t n_decoded_bytes = n_decoded_bits / 8;
+    if (n_decoded_bits % 8) {
+        n_decoded_bytes++;
+    }
+    return n_decoded_bytes;
+}
+
+// Decode soft with the viterbi39 routines (soft_len / 3 - 2 steps, then
+// soft_len / 3 - 10 bits chained back into msg) and return the number of
+// bytes needed to hold the decoded bits, rounded up.
+ssize_t conv_shim39_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 3;
+    const size_t n_decoded_bits = n_steps - 10;
+
+    init_viterbi39(conv_v, 0);
+    update_viterbi39_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi39(conv_v, msg, n_decoded_bits, 0);
+
+    // whole bytes, plus one more when a partial byte remains
+    size_t n_decoded_bytes = n_decoded_bits / 8;
+    if (n_decoded_bits % 8) {
+        n_decoded_bytes++;
+    }
+    return n_decoded_bytes;
+}
+
+// Decode soft with the viterbi615 routines (soft_len / 6 - 2 steps, then
+// soft_len / 6 - 16 bits chained back into msg) and return the number of
+// bytes needed to hold the decoded bits, rounded up.
+ssize_t conv_shim615_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    const size_t n_steps = soft_len / 6;
+    const size_t n_decoded_bits = n_steps - 16;
+
+    init_viterbi615(conv_v, 0);
+    update_viterbi615_blk(conv_v, soft, n_steps - 2);
+    chainback_viterbi615(conv_v, msg, n_decoded_bits, 0);
+
+    // whole bytes, plus one more when a partial byte remains
+    size_t n_decoded_bytes = n_decoded_bits / 8;
+    if (n_decoded_bits % 8) {
+        n_decoded_bytes++;
+    }
+    return n_decoded_bytes;
+}
--- a/core/libcorrect/util/error-sim-sse.c
+++ b/core/libcorrect/util/error-sim-sse.c
@@ -0,0 +1,13 @@
+#include "correct/util/error-sim-sse.h"
+
+// void*-typed adapter around correct_convolutional_sse_encode_len so it can
+// be stored in a generic testbench function pointer.
+size_t conv_correct_sse_enclen(void *conv_v, size_t msg_len) {
+    correct_convolutional_sse *conv = (correct_convolutional_sse *)conv_v;
+    return correct_convolutional_sse_encode_len(conv, msg_len);
+}
+
+// void*-typed adapter around correct_convolutional_sse_encode; the value
+// returned by the wrapped call is discarded.
+void conv_correct_sse_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded) {
+    correct_convolutional_sse *conv = (correct_convolutional_sse *)conv_v;
+    correct_convolutional_sse_encode(conv, msg, msg_len, encoded);
+}
+
+// void*-typed adapter around correct_convolutional_sse_decode_soft; returns
+// whatever the wrapped call returns.
+ssize_t conv_correct_sse_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    correct_convolutional_sse *conv = (correct_convolutional_sse *)conv_v;
+    return correct_convolutional_sse_decode_soft(conv, soft, soft_len, msg);
+}
--- a/core/libcorrect/util/error-sim.c
+++ b/core/libcorrect/util/error-sim.c
@@ -0,0 +1,188 @@
+#include "correct/util/error-sim.h"
+
+// Number of differing bits between the first len bytes of a and b: the sum
+// of popcount(a[i] ^ b[i]) over all bytes.
+size_t distance(uint8_t *a, uint8_t *b, size_t len) {
+    size_t dist = 0;
+    for (size_t i = 0; i < len; i++) {
+        // equal bytes XOR to 0 and contribute nothing, so no special case is needed
+        dist += popcount((unsigned int)a[i] ^ (unsigned int)b[i]);
+    }
+    return dist;
+}
+
+// Fill res[0 .. n_res) with zero-mean Gaussian samples of standard deviation
+// sigma, generated two at a time from rand() with the polar (Marsaglia)
+// form of the Box-Muller transform.
+void gaussian(double *res, size_t n_res, double sigma) {
+    for (size_t i = 0; i < n_res; i += 2) {
+        // compute using polar method of box muller
+        double s, u, v;
+        while (true) {
+            // The polar method needs a point drawn uniformly from the square
+            // [-1, 1] x [-1, 1]; drawing from [0, 1] instead made every
+            // generated sample non-negative.
+            u = 2.0 * ((double)(rand()) / (double)RAND_MAX) - 1.0;
+            v = 2.0 * ((double)(rand()) / (double)RAND_MAX) - 1.0;
+
+            s = u * u + v * v;
+
+            // keep only points strictly inside the unit circle and away from
+            // the origin, where log(s) / s blows up
+            if (s > DBL_EPSILON && s < 1) {
+                break;
+            }
+        }
+
+        double base = sqrt((-2.0 * log(s)) / s);
+
+        double z0 = u * base;
+        res[i] = z0 * sigma;
+
+        // the second sample of the pair is dropped when n_res is odd
+        if (i + 1 < n_res) {
+            double z1 = v * base;
+            res[i + 1] = z1 * sigma;
+        }
+    }
+}
+
+// Map the first n_syms bits of msg (most significant bit of each byte first)
+// to voltages: +bpsk_voltage for a 1 bit, -bpsk_voltage for a 0 bit.
+void encode_bpsk(uint8_t *msg, double *voltages, size_t n_syms, double bpsk_voltage) {
+    for (size_t sym = 0; sym < n_syms; sym++) {
+        // 0x80 for the first bit of a byte, down to 0x01 for the last
+        uint8_t bit_mask = (uint8_t)(0x80 >> (sym % 8));
+        if (msg[sym / 8] & bit_mask) {
+            voltages[sym] = bpsk_voltage;
+        } else {
+            voltages[sym] = -bpsk_voltage;
+        }
+    }
+}
+
+// Expand the first n_bits bits of bytes (most significant bit first) into
+// one output byte per bit: 255 for a 1 bit, 0 for a 0 bit.
+void byte2bit(uint8_t *bytes, uint8_t *bits, size_t n_bits) {
+    for (size_t bit = 0; bit < n_bits; bit++) {
+        // 0x80 for the first bit of a byte, down to 0x01 for the last
+        unsigned char cmask = (unsigned char)(0x80 >> (bit % 8));
+        if (bytes[bit / 8] & cmask) {
+            bits[bit] = 255;
+        } else {
+            bits[bit] = 0;
+        }
+    }
+}
+
+// Hard-decide n_syms soft symbols: every symbol above 127 sets the matching
+// bit in msg (most significant bit of each byte first). Bits are only ever
+// set, never cleared, so msg keeps any bits that were already 1.
+void decode_bpsk(uint8_t *soft, uint8_t *msg, size_t n_syms) {
+    for (size_t sym = 0; sym < n_syms; sym++) {
+        if (soft[sym] > 127) {
+            // 0x80 for the first bit of a byte, down to 0x01 for the last
+            msg[sym / 8] |= (uint8_t)(0x80 >> (sym % 8));
+        }
+    }
+}
+
+// Quantize n_syms voltages to soft symbols in 0..255. Each voltage is scaled
+// by 1 / bpsk_voltage; values above +1 saturate at 255, values below -1
+// saturate at 0, and everything in between maps to 127.5 + 127.5 * rel,
+// truncated to an integer.
+void decode_bpsk_soft(double *voltages, uint8_t *soft, size_t n_syms, double bpsk_voltage) {
+    for (size_t sym = 0; sym < n_syms; sym++) {
+        double rel = voltages[sym] / bpsk_voltage;
+
+        if (rel > 1) {
+            soft[sym] = 255;
+            continue;
+        }
+        if (rel < -1) {
+            soft[sym] = 0;
+            continue;
+        }
+        soft[sym] = (uint8_t)(127.5 + 127.5 * rel);
+    }
+}
+
+// Convert a value in dB to a linear ratio: 10^(l / 10).
+double log2amp(double l) {
+    double exponent = l / 10.0;
+    return pow(10.0, exponent);
+}
+
+// Convert a linear ratio to dB: 10 * log10(a).
+double amp2log(double a) {
+    double decades = log10(a);
+    return 10.0 * decades;
+}
+
+// Noise standard deviation for a given Eb/N0 (in dB) and bit energy.
+double sigma_for_eb_n0(double eb_n0, double bpsk_bit_energy) {
+    // eb/n0 is the ratio of bit energy to noise energy, given in dB; bring it
+    // back to a linear ratio first
+    double eb_n0_amp = log2amp(eb_n0);
+    // sigma^2 = n0/2 = ((eb/n0)^-1 * eb)/2 = eb/(2 * (eb/n0))
+    double variance = bpsk_bit_energy / (double)(2.0 * eb_n0_amp);
+    return sqrt(variance);
+}
+
+// Fill noise[0 .. n_syms) with Gaussian samples whose standard deviation is
+// the one sigma_for_eb_n0 derives from eb_n0 and bpsk_bit_energy.
+void build_white_noise(double *noise, size_t n_syms, double eb_n0, double bpsk_bit_energy) {
+    gaussian(noise, n_syms, sigma_for_eb_n0(eb_n0, bpsk_bit_energy));
+}
+
+// Add noise to signal in place, sample by sample, for n_syms samples.
+//
+// The noise values are treated as magnitudes of complex noise while the
+// signal is real valued. Assuming the noise is split evenly between its real
+// and imaginary parts (a 90-45-45 triangle in the complex plane), the
+// magnitude is sqrt(2) times the real part, so each noise sample is divided
+// by sqrt(2) before being added; the imaginary part is effectively dropped.
+void add_white_noise(double *signal, double *noise, size_t n_syms) {
+    const double sqrt_2 = sqrt(2);
+    double *sample = signal;
+    double *disturbance = noise;
+    for (size_t remaining = n_syms; remaining > 0; remaining--) {
+        *sample += *disturbance / sqrt_2;
+        sample++;
+        disturbance++;
+    }
+}
+
+// Create (when scratch is NULL) or resize a testbench so its buffers fit a
+// message of msg_len bytes. enclen_f(enc, msg_len) supplies the encoded
+// length, which is stored in scratch->enclen; scratch->enclen_bytes is that
+// length divided by 8 and rounded up. Returns the testbench.
+// NOTE(review): realloc results are stored unchecked, as in every caller's
+// expectations so far -- allocation failure is not reported.
+conv_testbench *resize_conv_testbench(conv_testbench *scratch, size_t (*enclen_f)(void *, size_t), void *enc, size_t msg_len) {
+    if (scratch == NULL) {
+        // zero-initialised so the realloc calls below start from NULL pointers
+        scratch = calloc(1, sizeof(conv_testbench));
+    }
+
+    scratch->msg_out = realloc(scratch->msg_out, msg_len);
+
+    const size_t enclen = enclen_f(enc, msg_len);
+    size_t enclen_bytes = enclen / 8;
+    if (enclen % 8) {
+        enclen_bytes++;
+    }
+    scratch->enclen = enclen;
+    scratch->enclen_bytes = enclen_bytes;
+
+    const size_t per_symbol_doubles = enclen * sizeof(double);
+    scratch->encoded = realloc(scratch->encoded, enclen_bytes);
+    scratch->v = realloc(scratch->v, per_symbol_doubles);
+    scratch->corrupted = realloc(scratch->corrupted, per_symbol_doubles);
+    scratch->noise = realloc(scratch->noise, per_symbol_doubles);
+    scratch->soft = realloc(scratch->soft, enclen);
+    return scratch;
+}
+
+// Release a testbench and all of its buffers. Passing NULL is a no-op, in
+// line with free().
+void free_scratch(conv_testbench *scratch) {
+    if (!scratch) {
+        return;
+    }
+    free(scratch->msg_out);
+    free(scratch->encoded);
+    free(scratch->v);
+    free(scratch->corrupted);
+    free(scratch->soft);
+    free(scratch->noise);
+    free(scratch);
+}
+
+// Push one message through the testbench: encode msg, map it to voltages,
+// add the noise already stored in scratch->noise, quantize to soft symbols,
+// decode, and return the number of bit errors between msg and the decoded
+// output. Exits the process with status 1 when the decoder does not report
+// exactly n_bytes decoded bytes.
+int test_conv_noise(conv_testbench *scratch, uint8_t *msg, size_t n_bytes,
+                    double bpsk_voltage) {
+    scratch->encode(scratch->encoder, msg, n_bytes, scratch->encoded);
+    encode_bpsk(scratch->encoded, scratch->v, scratch->enclen, bpsk_voltage);
+
+    // keep the clean voltages in v and corrupt a copy
+    memcpy(scratch->corrupted, scratch->v, scratch->enclen * sizeof(double));
+    add_white_noise(scratch->corrupted, scratch->noise, scratch->enclen);
+    decode_bpsk_soft(scratch->corrupted, scratch->soft, scratch->enclen, bpsk_voltage);
+
+    memset(scratch->msg_out, 0, n_bytes);
+
+    ssize_t decode_len = scratch->decode(scratch->decoder, scratch->soft, scratch->enclen, scratch->msg_out);
+
+    // decode_len is signed: test for a negative value explicitly and print it
+    // with %zd rather than %zu
+    if (decode_len < 0 || (size_t)decode_len != n_bytes) {
+        printf("expected to decode %zu bytes, decoded %zd bytes instead\n", n_bytes, decode_len);
+        exit(1);
+    }
+
+    return distance((uint8_t*)msg, scratch->msg_out, n_bytes);
+}
+
+// void*-typed adapter around correct_convolutional_encode_len so it can be
+// stored in a generic testbench function pointer.
+size_t conv_correct_enclen(void *conv_v, size_t msg_len) {
+    correct_convolutional *conv = (correct_convolutional *)conv_v;
+    return correct_convolutional_encode_len(conv, msg_len);
+}
+
+// void*-typed adapter around correct_convolutional_encode; the value
+// returned by the wrapped call is discarded.
+void conv_correct_encode(void *conv_v, uint8_t *msg, size_t msg_len, uint8_t *encoded) {
+    correct_convolutional *conv = (correct_convolutional *)conv_v;
+    correct_convolutional_encode(conv, msg, msg_len, encoded);
+}
+
+// void*-typed adapter around correct_convolutional_decode_soft; returns
+// whatever the wrapped call returns.
+ssize_t conv_correct_decode(void *conv_v, uint8_t *soft, size_t soft_len, uint8_t *msg) {
+    correct_convolutional *conv = (correct_convolutional *)conv_v;
+    return correct_convolutional_decode_soft(conv, soft, soft_len, msg);
+}
--- a/core/src/gui/widgets/waterfall.cpp
+++ b/core/src/gui/widgets/waterfall.cpp
@@ -1245,6 +1245,9 @@ namespace ImGui {

    void WaterFall::showWaterfall() {
        buf_mtx.lock();
+        if (rawFFTs ==  NULL) {
+            spdlog::error("Null rawFFT");
+        }
        waterfallVisible = true;
        onResize();
        memset(rawFFTs, 0, waterfallHeight * rawFFTSize * sizeof(float));
--- a/docker_builds/debian_bullseye/do_build.sh
+++ b/docker_builds/debian_bullseye/do_build.sh
@@ -5,7 +5,8 @@ cd /root
 # Install dependencies and tools
 apt update
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk2-dev libsoapysdr-dev libairspyhf-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -17,7 +18,7 @@ cp inc/* /usr/include/
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON
+cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 cd ..
--- a/docker_builds/debian_buster/do_build.sh
+++ b/docker_builds/debian_buster/do_build.sh
@@ -5,7 +5,8 @@ cd /root
 # Install dependencies and tools
 apt update
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk1-dev libsoapysdr-dev libairspyhf-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -17,7 +18,7 @@ cp inc/* /usr/include/
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_BLADERF_SOURCE=OFF -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON
+cmake .. -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_BLADERF_SOURCE=OFF -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 cd ..
--- a/docker_builds/debian_sid/do_build.sh
+++ b/docker_builds/debian_sid/do_build.sh
@@ -5,7 +5,8 @@ cd /root
 # Install dependencies and tools
 apt update
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk2-dev libsoapysdr-dev libairspyhf-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -17,7 +18,7 @@ cp inc/* /usr/include/
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON
+cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 cd ..
--- a/docker_builds/ubuntu_bionic/do_build.sh
+++ b/docker_builds/ubuntu_bionic/do_build.sh
@@ -11,7 +11,8 @@ apt update

 # Install dependencies and tools
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk1-dev libsoapysdr-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -35,7 +36,7 @@ cd ../../
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_BLADERF_SOURCE=OFF -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_OVERRIDE_STD_FILESYSTEM=ON
+cmake .. -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_BLADERF_SOURCE=OFF -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_OVERRIDE_STD_FILESYSTEM=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 # Generate package
--- a/docker_builds/ubuntu_focal/do_build.sh
+++ b/docker_builds/ubuntu_focal/do_build.sh
@@ -5,7 +5,8 @@ cd /root
 # Install dependencies and tools
 apt update
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk2-dev libsoapysdr-dev libairspyhf-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -17,7 +18,7 @@ cp inc/* /usr/include/
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON
+cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 cd ..
--- a/docker_builds/ubuntu_groovy/do_build.sh
+++ b/docker_builds/ubuntu_groovy/do_build.sh
@@ -5,7 +5,8 @@ cd /root
 # Install dependencies and tools
 apt update
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk2-dev libsoapysdr-dev libairspyhf-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -17,7 +18,7 @@ cp inc/* /usr/include/
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON
+cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 cd ..
--- a/docker_builds/ubuntu_hirsute/do_build.sh
+++ b/docker_builds/ubuntu_hirsute/do_build.sh
@@ -5,7 +5,8 @@ cd /root
 # Install dependencies and tools
 apt update
 apt install -y build-essential cmake git libfftw3-dev libglfw3-dev libglew-dev libvolk2-dev libsoapysdr-dev libairspyhf-dev libairspy-dev \
-            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev
+            libiio-dev libad9361-dev librtaudio-dev libhackrf-dev librtlsdr-dev libbladerf-dev liblimesuite-dev p7zip-full wget portaudio19-dev \
+            libcodec2-dev

 # Install SDRPlay libraries
 wget https://www.sdrplay.com/software/SDRplay_RSP_API-Linux-3.07.1.run
@@ -17,7 +18,7 @@ cp inc/* /usr/include/
 cd SDRPlusPlus
 mkdir build
 cd build
-cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON
+cmake .. -DOPT_BUILD_BLADERF_SOURCE=ON -DOPT_BUILD_LIMESDR_SOURCE=ON -DOPT_BUILD_SDRPLAY_SOURCE=ON -DOPT_BUILD_NEW_PORTAUDIO_SINK=ON -DOPT_BUILD_M17_DECODER=ON
 make -j2

 cd ..
--- a/m17_decoder/CMakeLists.txt
+++ b/m17_decoder/CMakeLists.txt
@@ -26,11 +26,7 @@ if (MSVC)
    target_include_directories(m17_decoder PUBLIC "C:/Users/ryzerth/Documents/Code/codec2/src")
    target_link_directories(sdrpp_core PUBLIC "C:/Users/ryzerth/Documents/Code/codec2/build/src")

-    target_include_directories(m17_decoder PUBLIC "C:/Program Files (x86)/Correct/include")
-    target_link_directories(sdrpp_core PUBLIC "C:/Program Files (x86)/Correct/lib")
-
    target_link_libraries(m17_decoder PRIVATE libcodec2)
-    target_link_libraries(m17_decoder PRIVATE correct)

 else (MSVC)
    find_package(PkgConfig)
--- a/m17_decoder/src/main.cpp
+++ b/m17_decoder/src/main.cpp
@@ -222,7 +222,7 @@ private:
            ImGui::EndTable();
        }

-        if (ImGui::Checkbox(CONCAT("Show Reference Lines##m17_showlines_", _this->name), &_this->showLines)) {
+        if (ImGui::Checkbox(CONCAT("Show Reference TEST Lines##m17_showlines_", _this->name), &_this->showLines)) {
            if (_this->showLines) {
                _this->diag.lines.push_back(-0.75f);
                _this->diag.lines.push_back(-0.25f);
@@ -290,7 +290,7 @@ private:

    M17LSF lsf;
    std::mutex lsfMtx;
-    std::chrono::system_clock::time_point lastUpdated;
+    std::chrono::time_point<std::chrono::high_resolution_clock> lastUpdated;
 };

 MOD_EXPORT void _INIT_() {