Bugfix + added M17 decoder to the linux CI

2025-07-01 22:57:49 +02:00 · 2021-10-02 17:01:23 +02:00
parent 26fa23c8f5
commit b4213ea049
86 changed files with 6601 additions and 20 deletions
--- a/core/libcorrect/src/convolutional/CMakeLists.txt
+++ b/core/libcorrect/src/convolutional/CMakeLists.txt
@ -0,0 +1,5 @@
+# Compile the convolutional codec sources into an OBJECT library named
+# correct-convolutional.
+set(SRCFILES bit.c metric.c history_buffer.c error_buffer.c lookup.c convolutional.c encode.c decode.c)
+add_library(correct-convolutional OBJECT ${SRCFILES})
+# Descend into the sse subdirectory only when HAVE_SSE is set.
+if(HAVE_SSE)
+    add_subdirectory(sse)
+endif()
--- a/core/libcorrect/src/convolutional/bit.c
+++ b/core/libcorrect/src/convolutional/bit.c
@ -0,0 +1,232 @@
+#include "correct/convolutional/bit.h"
+
+// Allocates a zero-initialized bit writer. When `bytes` is non-NULL the writer
+// is immediately pointed at that buffer of `len` bytes.
+// Returns NULL if the allocation fails. Release with bit_writer_destroy().
+bit_writer_t *bit_writer_create(uint8_t *bytes, size_t len) {
+    bit_writer_t *w = calloc(1, sizeof(*w));
+    if (!w) {
+        // out of memory: do not hand NULL to bit_writer_reconfigure
+        return NULL;
+    }
+
+    if (bytes) {
+        bit_writer_reconfigure(w, bytes, len);
+    }
+
+    return w;
+}
+
+// Points the writer at a new output buffer of `len` bytes and discards any
+// partially accumulated byte, so writing restarts at bytes[0].
+void bit_writer_reconfigure(bit_writer_t *w, uint8_t *bytes, size_t len) {
+    // rewind and drop pending bits
+    w->byte_index = 0;
+    w->current_byte_len = 0;
+    w->current_byte = 0;
+
+    // attach the new buffer
+    w->len = len;
+    w->bytes = bytes;
+}
+
+// Frees the writer itself. The output buffer it points at is not freed here.
+void bit_writer_destroy(bit_writer_t *w) {
+    free(w);
+}
+
+// Writes the `n` low-order bits of `val`, least significant bit first.
+void bit_writer_write(bit_writer_t *w, uint8_t val, unsigned int n) {
+    unsigned int remaining = n;
+    while (remaining > 0) {
+        // bit_writer_write_1 only looks at the lowest bit
+        bit_writer_write_1(w, val);
+        val >>= 1;
+        remaining--;
+    }
+}
+
+// Appends the lowest bit of `val` to the byte being assembled. A completed
+// byte is stored into the output buffer; otherwise the pending byte is shifted
+// left so its low bit is free for the next write.
+void bit_writer_write_1(bit_writer_t *w, uint8_t val) {
+    w->current_byte |= val & 1;
+    w->current_byte_len++;
+
+    if (w->current_byte_len != 8) {
+        // make room for the next bit
+        w->current_byte <<= 1;
+        return;
+    }
+
+    // 8 bits in a byte -- store it and start a fresh one
+    w->bytes[w->byte_index] = w->current_byte;
+    w->byte_index++;
+    w->current_byte = 0;
+    w->current_byte_len = 0;
+}
+
+// Appends `len` bits to the output, one bit per element of `l`, starting at
+// l[0]. Each element is OR-ed into the byte as-is, so it is expected to be
+// 0 or 1. Bits are packed most-significant-bit first.
+void bit_writer_write_bitlist(bit_writer_t *w, uint8_t *l, size_t len) {
+    // first close the current byte
+    // we might have been given too few elements to do that. be careful.
+    size_t close_len = 8 - w->current_byte_len;
+    close_len = (close_len < len) ? close_len : len;
+
+    // 16 bits wide so the extra trailing shift in the loop cannot drop the top bit
+    uint16_t b = w->current_byte;
+
+    for (ptrdiff_t i = 0; i < close_len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+
+    l += close_len;
+    len -= close_len;
+
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+
+    if (w->current_byte_len + close_len == 8) {
+        // the loop shifted once more than needed; undo that before storing
+        b >>= 1;
+        bytes[byte_index] = b;
+        byte_index++;
+    } else {
+        // still not a whole byte: keep the pending (pre-shifted) bits and stop
+        w->current_byte = b;
+        w->current_byte_len += close_len;
+        return;
+    }
+
+    size_t full_bytes = len/8;
+
+    // pack whole bytes directly, eight list elements at a time
+    for (size_t i = 0; i < full_bytes; i++) {
+        bytes[byte_index] = l[0] << 7 | l[1] << 6 | l[2] << 5 |
+                            l[3] << 4 | l[4] << 3 | l[5] << 2 |
+                            l[6] << 1 | l[7];
+        byte_index += 1;
+        l += 8;
+    }
+
+    len -= 8*full_bytes;
+
+    // fewer than 8 bits remain; leave them pending, pre-shifted like
+    // bit_writer_write_1 does
+    b = 0;
+    for (ptrdiff_t i = 0; i < len; i++) {
+        b |= l[i];
+        b <<= 1;
+    }
+
+    w->current_byte = b;
+    w->byte_index = byte_index;
+    w->current_byte_len = len;
+}
+
+// Appends `len` bits to the output, consuming `l` from its last element down
+// to its first (one bit per element, each expected to be 0 or 1). Bits are
+// packed most-significant-bit first, so l[len - 1] is written first.
+//
+// The list is walked with an index instead of a decrementing pointer so that
+// no pointer below the start of `l` is ever formed, including when len == 0.
+void bit_writer_write_bitlist_reversed(bit_writer_t *w, uint8_t *l, size_t len) {
+    // l[0 .. remaining) is still unwritten; the next bit to emit is l[remaining - 1]
+    size_t remaining = len;
+
+    uint8_t *bytes = w->bytes;
+    size_t byte_index = w->byte_index;
+    uint16_t b;
+
+    if (w->current_byte_len != 0) {
+        // first close the partially filled byte, as far as the input allows
+        size_t close_len = 8 - w->current_byte_len;
+        close_len = (close_len < remaining) ? close_len : remaining;
+
+        b = w->current_byte;
+
+        for (size_t i = 0; i < close_len; i++) {
+            remaining--;
+            b |= l[remaining];
+            b <<= 1;
+        }
+
+        if (w->current_byte_len + close_len == 8) {
+            // the loop shifted once more than needed; undo that before storing
+            b >>= 1;
+            bytes[byte_index] = b;
+            byte_index++;
+        } else {
+            // still not a whole byte: keep the pending (pre-shifted) bits and stop
+            w->current_byte = b;
+            w->current_byte_len += close_len;
+            return;
+        }
+    }
+
+    // emit whole bytes, eight list elements at a time
+    size_t full_bytes = remaining / 8;
+
+    for (size_t i = 0; i < full_bytes; i++) {
+        bytes[byte_index] = l[remaining - 1] << 7 | l[remaining - 2] << 6 |
+                            l[remaining - 3] << 5 | l[remaining - 4] << 4 |
+                            l[remaining - 5] << 3 | l[remaining - 6] << 2 |
+                            l[remaining - 7] << 1 | l[remaining - 8];
+        byte_index += 1;
+        remaining -= 8;
+    }
+
+    // fewer than 8 bits remain; leave them pending, pre-shifted like
+    // bit_writer_write_1 does
+    size_t pending = remaining;
+    b = 0;
+    while (remaining > 0) {
+        remaining--;
+        b |= l[remaining];
+        b <<= 1;
+    }
+
+    w->current_byte = (uint8_t)b;
+    w->byte_index = byte_index;
+    w->current_byte_len = pending;
+}
+
+// Pads a partially filled byte with zero bits on the right and stores it.
+// Does nothing when no bits are pending.
+void bit_writer_flush_byte(bit_writer_t *w) {
+    if (w->current_byte_len != 0) {
+        // Every write path leaves current_byte already shifted left by one
+        // (the low bit is kept free for the next bit), so the k pending bits
+        // sit in bit positions k..1. Moving the first bit to position 7 takes
+        // 7 - k further shifts; shifting by 8 - k would push it out of the byte.
+        w->current_byte <<= (7 - w->current_byte_len);
+        w->bytes[w->byte_index] = w->current_byte;
+        w->byte_index++;
+        w->current_byte_len = 0;
+        // start the next byte clean so later writes do not OR into stale bits
+        w->current_byte = 0;
+    }
+}
+
+// Returns the number of whole bytes stored so far. Bits still pending in the
+// partially assembled byte are not counted.
+size_t bit_writer_length(bit_writer_t *w) {
+    return w->byte_index;
+}
+
+// Returns `b` with its eight bits in mirrored order (bit 0 <-> bit 7, ...).
+uint8_t reverse_byte(uint8_t b) {
+    uint8_t mirrored = 0;
+    for (unsigned int bit = 0; bit < 8; bit++) {
+        // pull bits off the low end of b and push them onto the low end of
+        // the result, which reverses their order
+        mirrored = (uint8_t)((mirrored << 1) | (b & 1));
+        b >>= 1;
+    }
+    return mirrored;
+}
+
+// reverse_table[v] holds reverse_byte(v) once create_reverse_table() has run.
+static uint8_t reverse_table[256];
+
+// Fills reverse_table with the bit-mirrored value of every byte.
+void create_reverse_table() {
+    unsigned int value = 0;
+    while (value < 256) {
+        reverse_table[value] = reverse_byte((uint8_t)value);
+        value++;
+    }
+}
+
+// Allocates a zero-initialized bit reader and, on the first call, builds the
+// shared byte-reversal table. When `bytes` is non-NULL the reader is
+// immediately pointed at that buffer of `len` bytes.
+// Returns NULL if the allocation fails. Release with bit_reader_destroy().
+bit_reader_t *bit_reader_create(const uint8_t *bytes, size_t len) {
+    bit_reader_t *r = calloc(1, sizeof(*r));
+    if (!r) {
+        // out of memory: do not hand NULL to bit_reader_reconfigure
+        return NULL;
+    }
+
+    // NOTE(review): this one-time flag is not synchronized; concurrent first
+    // calls from several threads would need external locking.
+    static bool reverse_table_created = false;
+
+    if (!reverse_table_created) {
+        create_reverse_table();
+        reverse_table_created = true;
+    }
+
+    if (bytes) {
+        bit_reader_reconfigure(r, bytes, len);
+    }
+
+    return r;
+}
+
+// Points the reader at a new input buffer of `len` bytes and rewinds it to the
+// first bit of bytes[0].
+void bit_reader_reconfigure(bit_reader_t *r, const uint8_t *bytes, size_t len) {
+    r->bytes = bytes;
+    r->len = len;
+
+    r->current_byte_len = 8;
+    // an empty or missing buffer has no bytes[0] to preload; use 0 instead of
+    // reading out of bounds
+    r->current_byte = (bytes && len > 0) ? bytes[0] : 0;
+    r->byte_index = 0;
+}
+
+// Frees the reader itself. The input buffer it points at is not freed here.
+void bit_reader_destroy(bit_reader_t *r) {
+    free(r);
+}
+
+// Reads the next `n` bits from the input and returns them in the low bits of
+// the result. Bits are taken from each byte most significant first; the final
+// table lookup mirrors them, so the first bit read ends up in bit 0.
+// The result indexes a 256-entry table and is returned as uint8_t, so `n` is
+// expected to be at most 8. No check is made against the buffer length.
+uint8_t bit_reader_read(bit_reader_t *r, unsigned int n) {
+    unsigned int read = 0;
+    unsigned int n_copy = n;
+
+    if (r->current_byte_len < n) {
+        // not enough bits left in this byte: take what remains, then move on
+        // to the next byte for the rest
+        read = r->current_byte & ((1 << r->current_byte_len) - 1);
+        r->byte_index++;
+        r->current_byte = r->bytes[r->byte_index];
+        n -= r->current_byte_len;
+        r->current_byte_len = 8;
+        // make room below for the bits that come from the new byte
+        read <<= n;
+    }
+
+    // select the top `n` of the bits still unread in the current byte
+    uint8_t copy_mask = (1 << n) - 1;
+    copy_mask <<= (r->current_byte_len - n);
+    read |= (r->current_byte & copy_mask) >> (r->current_byte_len - n);
+    r->current_byte_len -= n;
+    // mirror all 8 bits, then drop the unused low positions
+    return reverse_table[read] >> (8 - n_copy);
+}
--- a/core/libcorrect/src/convolutional/convolutional.c
+++ b/core/libcorrect/src/convolutional/convolutional.c
@ -0,0 +1,59 @@
+#include "correct/convolutional/convolutional.h"
+
+// https://www.youtube.com/watch?v=b3_lVSrPB6w
+
+// Initializes an already allocated codec instance for the given rate, order
+// and generator polynomials: builds the output lookup table and creates the
+// bit writer and reader. Decoder state is created lazily elsewhere
+// (has_init_decode starts out false).
+// Returns `conv` on success. Returns NULL when order or rate is out of range
+// or when an allocation fails; in that case nothing allocated here is left
+// behind and the caller still owns `conv` itself.
+correct_convolutional *_correct_convolutional_init(correct_convolutional *conv,
+                                                   size_t rate, size_t order,
+                                                   const polynomial_t *poly) {
+    if (order > 8 * sizeof(shift_register_t)) {
+        // XXX turn this into an error code
+        // printf("order must be smaller than 8 * sizeof(shift_register_t)\n");
+        return NULL;
+    }
+    if (rate < 2) {
+        // XXX turn this into an error code
+        // printf("rate must be 2 or greater\n");
+        return NULL;
+    }
+
+    conv->order = order;
+    conv->rate = rate;
+    conv->numstates = 1 << order;
+
+    // one table entry per shift register state
+    unsigned int *table = malloc(sizeof(unsigned int) * (1 << order));
+    if (!table) {
+        return NULL;
+    }
+    fill_table(conv->rate, conv->order, poly, table);
+
+    conv->bit_writer = bit_writer_create(NULL, 0);
+    conv->bit_reader = bit_reader_create(NULL, 0);
+    if (!conv->bit_writer || !conv->bit_reader) {
+        // undo the partial setup; both destroy functions accept NULL
+        bit_writer_destroy(conv->bit_writer);
+        bit_reader_destroy(conv->bit_reader);
+        conv->bit_writer = NULL;
+        conv->bit_reader = NULL;
+        free(table);
+        return NULL;
+    }
+
+    // the cast suggests the field is declared const; it is written once here
+    *(unsigned int **)&conv->table = table;
+
+    conv->has_init_decode = false;
+    return conv;
+}
+
+// Allocates and initializes a codec instance.
+// Returns NULL when the allocation fails or when initialization rejects the
+// parameters. Release with correct_convolutional_destroy().
+correct_convolutional *correct_convolutional_create(size_t rate, size_t order,
+                                                    const polynomial_t *poly) {
+    correct_convolutional *conv = malloc(sizeof(correct_convolutional));
+    if (!conv) {
+        // out of memory: init would dereference NULL
+        return NULL;
+    }
+    correct_convolutional *init_conv = _correct_convolutional_init(conv, rate, order, poly);
+    if (!init_conv) {
+        free(conv);
+    }
+    return init_conv;
+}
+
+// Releases everything the instance owns, but not the instance struct itself.
+// Decoder state is only released if it was ever created.
+void _correct_convolutional_teardown(correct_convolutional *conv) {
+    // always present after a successful init
+    free(*(unsigned int **)&conv->table);
+    bit_writer_destroy(conv->bit_writer);
+    bit_reader_destroy(conv->bit_reader);
+
+    if (!conv->has_init_decode) {
+        return;
+    }
+
+    // created lazily by the first decode
+    pair_lookup_destroy(conv->pair_lookup);
+    history_buffer_destroy(conv->history_buffer);
+    error_buffer_destroy(conv->errors);
+    free(conv->distances);
+}
+
+// Releases everything owned by `conv` and then `conv` itself.
+// `conv` must not be NULL: the teardown dereferences it.
+void correct_convolutional_destroy(correct_convolutional *conv) {
+    _correct_convolutional_teardown(conv);
+    free(conv);
+}
--- a/core/libcorrect/src/convolutional/decode.c
+++ b/core/libcorrect/src/convolutional/decode.c
@ -0,0 +1,321 @@
+#include "correct/convolutional/convolutional.h"
+
+void conv_decode_print_iter(correct_convolutional *conv, unsigned int iter,
+                            unsigned int winner_index) {
+    if (iter < 2220) {
+        return;
+    }
+    printf("iteration: %d\n", iter);
+    distance_t *errors = conv->errors->write_errors;
+    printf("errors:\n");
+    for (shift_register_t i = 0; i < conv->numstates / 2; i++) {
+        printf("%2d: %d\n", i, errors[i]);
+    }
+    printf("\n");
+    printf("history:\n");
+    for (shift_register_t i = 0; i < conv->numstates / 2; i++) {
+        printf("%2d: ", i);
+        for (unsigned int j = 0; j <= winner_index; j++) {
+            printf("%d", conv->history_buffer->history[j][i] ? 1 : 0);
+        }
+        printf("\n");
+    }
+    printf("\n");
+}
+
+// First decode phase: processes the first order - 1 symbol sets (or fewer if
+// `sets` is smaller) while the shift register is still filling up. Only error
+// metrics are accumulated; no history is recorded and nothing is written out.
+// `soft` selects the input: NULL means hard bits are pulled from the bit
+// reader, otherwise soft values are read from soft[i * rate ...].
+void convolutional_decode_warmup(correct_convolutional *conv, unsigned int sets,
+                                 const uint8_t *soft) {
+    // first phase: load shiftregister up from 0 (order goes from 1 to conv->order)
+    // we are building up error metrics for the first order bits
+    for (unsigned int i = 0; i < conv->order - 1 && i < sets; i++) {
+        // peel off rate bits from encoded to recover the same `out` as in the encoding process
+        // the difference being that this `out` will have the channel noise/errors applied
+        // (`out` is only assigned and only read on the hard-decision path)
+        unsigned int out;
+        if (!soft) {
+            out = bit_reader_read(conv->bit_reader, conv->rate);
+        }
+        const distance_t *read_errors = conv->errors->read_errors;
+        distance_t *write_errors = conv->errors->write_errors;
+        // walk all of the state we have so far
+        for (size_t j = 0; j < (1 << (i + 1)); j += 1) {
+            // predecessor state: drop the newest bit
+            unsigned int last = j >> 1;
+            distance_t dist;
+            if (soft) {
+                if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                    dist = metric_soft_distance_linear(conv->table[j], soft + i * conv->rate,
+                                                       conv->rate);
+                } else {
+                    dist = metric_soft_distance_quadratic(conv->table[j], soft + i * conv->rate,
+                                                          conv->rate);
+                }
+            } else {
+                dist = metric_distance(conv->table[j], out);
+            }
+            write_errors[j] = dist + read_errors[last];
+        }
+        error_buffer_swap(conv->errors);
+    }
+}
+
+// Main decode phase: processes symbol sets order - 1 up to (but excluding)
+// sets - order + 1. For every set it computes the distance of each possible
+// output word, picks the better of the two predecessors for every state,
+// records that choice in the history buffer, and lets the history buffer
+// emit decoded bits.
+// NOTE(review): the loop bound is computed in unsigned arithmetic, so it
+// assumes sets >= order - 1 -- confirm the caller guarantees this.
+void convolutional_decode_inner(correct_convolutional *conv, unsigned int sets,
+                                const uint8_t *soft) {
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
+        distance_t *distances = conv->distances;
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            // note: this `i` shadows the outer loop counter inside this loop only
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        pair_lookup_t pair_lookup = conv->pair_lookup;
+        pair_lookup_fill_distance(pair_lookup, distances);
+
+        // a mask to get the high order bit from the shift register
+        unsigned int num_iter = highbit << 1;
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+        // walk through all states, ignoring oldest bit
+        // we will track a best register state (path) and the number of bit errors at that path at
+        // this time slice
+        // this loop considers two paths per iteration (high order bit set, clear)
+        // so, it only runs numstates/2 iterations
+        // we'll update the history for every state and find the path with the least aggregated bit
+        // errors
+
+        // now run the main loop
+        // we calculate 2 sets of 2 register states here (4 states per iter)
+        // this creates 2 sets which share a predecessor, and 2 sets which share a successor
+        //
+        // the first set definition is the two states that are the same except for the least order
+        // bit
+        // these two share a predecessor because their high n - 1 bits are the same (differ only by
+        // newest bit)
+        //
+        // the second set definition is the two states that are the same except for the high order
+        // bit
+        // these two share a successor because the oldest high order bit will be shifted out, and
+        // the other bits will be present in the successor
+        //
+        // NOTE(review): each outer iteration writes 8 successor states
+        // (low .. low + 7), which appears to assume highbit >= 8, i.e.
+        // order >= 4 -- confirm.
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += 8, high += 8, base += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 4;
+                 offset += 2, base_offset += 1) {
+                distance_pair_key_t low_key = pair_lookup.keys[base + base_offset];
+                distance_pair_key_t high_key = pair_lookup.keys[highbase + base + base_offset];
+                distance_pair_t low_concat_dist = pair_lookup.distances[low_key];
+                distance_pair_t high_concat_dist = pair_lookup.distances[high_key];
+
+                distance_t low_past_error = read_errors[base + base_offset];
+                distance_t high_past_error = read_errors[highbase + base + base_offset];
+
+                // the low 16 bits of the pair are used for the even successor
+                distance_t low_error = (low_concat_dist & 0xffff) + low_past_error;
+                distance_t high_error = (high_concat_dist & 0xffff) + high_past_error;
+
+                shift_register_t successor = low + offset;
+                distance_t error;
+                uint8_t history_mask;
+                // ties go to the predecessor with the high bit clear
+                if (low_error <= high_error) {
+                    error = low_error;
+                    history_mask = 0;
+                } else {
+                    error = high_error;
+                    history_mask = 1;
+                }
+                write_errors[successor] = error;
+                history[successor] = history_mask;
+
+                shift_register_t low_plus_one = low + offset + 1;
+
+                // the high 16 bits of the pair are used for the odd successor
+                distance_t low_plus_one_error = (low_concat_dist >> 16) + low_past_error;
+                distance_t high_plus_one_error = (high_concat_dist >> 16) + high_past_error;
+
+                shift_register_t plus_one_successor = low_plus_one;
+                distance_t plus_one_error;
+                uint8_t plus_one_history_mask;
+                if (low_plus_one_error <= high_plus_one_error) {
+                    plus_one_error = low_plus_one_error;
+                    plus_one_history_mask = 0;
+                } else {
+                    plus_one_error = high_plus_one_error;
+                    plus_one_history_mask = 1;
+                }
+                write_errors[plus_one_successor] = plus_one_error;
+                history[plus_one_successor] = plus_one_history_mask;
+            }
+        }
+
+        history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+// Final decode phase: processes the last order - 1 symbol sets, during which
+// only zero bits can have been shifted in. Each step therefore visits only
+// every `skip`-th state, with `skip` doubling from one step to the next.
+// NOTE(review): the start index sets - order + 1 is computed in unsigned
+// arithmetic, so it assumes sets >= order - 1 -- confirm the caller
+// guarantees this.
+void convolutional_decode_tail(correct_convolutional *conv, unsigned int sets,
+                               const uint8_t *soft) {
+    // flush state registers
+    // now we only shift in 0s, skipping 1-successors
+    shift_register_t highbit = 1 << (conv->order - 1);
+    for (unsigned int i = sets - conv->order + 1; i < sets; i++) {
+        // lasterrors are the aggregate bit errors for the states of shiftregister for the previous
+        // time slice
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        uint8_t *history = history_buffer_get_slice(conv->history_buffer);
+
+        // calculate the distance from all output states to our sliced bits
+        distance_t *distances = conv->distances;
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            // note: this `i` shadows the outer loop counter inside this loop only
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        const unsigned int *table = conv->table;
+
+        // a mask to get the high order bit from the shift register
+        unsigned int num_iter = highbit << 1;
+        // stride between the states still reachable at this step
+        unsigned int skip = 1 << (conv->order - (sets - i));
+        unsigned int base_skip = skip >> 1;
+
+        shift_register_t highbase = highbit >> 1;
+        for (shift_register_t low = 0, high = highbit, base = 0; high < num_iter;
+             low += skip, high += skip, base += base_skip) {
+            unsigned int low_output = table[low];
+            unsigned int high_output = table[high];
+            distance_t low_dist = distances[low_output];
+            distance_t high_dist = distances[high_output];
+
+            distance_t low_past_error = read_errors[base];
+            distance_t high_past_error = read_errors[highbase + base];
+
+            distance_t low_error = low_dist + low_past_error;
+            distance_t high_error = high_dist + high_past_error;
+
+            shift_register_t successor = low;
+            distance_t error;
+            uint8_t history_mask;
+            // ties go to the predecessor with the high bit set here (strict <),
+            // unlike the main loop in convolutional_decode_inner which uses <=
+            if (low_error < high_error) {
+                error = low_error;
+                history_mask = 0;
+            } else {
+                error = high_error;
+                history_mask = 1;
+            }
+            write_errors[successor] = error;
+            history[successor] = history_mask;
+        }
+
+        history_buffer_process_skip(conv->history_buffer, write_errors, conv->bit_writer, skip);
+        error_buffer_swap(conv->errors);
+    }
+}
+
+// Creates the decoder-side state of `conv`: the per-output distance table, the
+// pair lookup, the history buffer and the double-buffered error metrics. Also
+// selects linear soft measurement and marks the decoder state as initialized.
+void _convolutional_decode_init(correct_convolutional *conv, unsigned int min_traceback,
+                                unsigned int traceback_length, unsigned int renormalize_interval) {
+    conv->has_init_decode = true;
+
+    // one distance per possible `rate`-bit output word
+    conv->distances = calloc(1 << (conv->rate), sizeof(distance_t));
+
+    conv->pair_lookup = pair_lookup_create(conv->rate, conv->order, conv->table);
+
+    conv->soft_measurement = CORRECT_SOFT_LINEAR;
+
+    // history depth is min_traceback + traceback_length slices; each slice
+    // covers half the states, and the top register bit is the traceback bit
+    conv->history_buffer =
+        history_buffer_create(min_traceback, traceback_length, renormalize_interval,
+                              conv->numstates / 2, 1 << (conv->order - 1));
+
+    conv->errors = error_buffer_create(conv->numstates);
+}
+
+// Shared driver for hard and soft decoding. `soft_encoded` is NULL for hard
+// decoding, in which case the bit reader must already be pointed at the
+// encoded input. Decoded bytes are written to `msg`.
+// Returns the number of decoded bytes written, or -1 when the input is too
+// short to decode.
+static ssize_t _convolutional_decode(correct_convolutional *conv, size_t num_encoded_bits,
+                                     size_t num_encoded_bytes, uint8_t *msg,
+                                     const soft_t *soft_encoded) {
+    size_t sets = num_encoded_bits / conv->rate;
+
+    // The three phases below cover the symbol sets [0, order - 1),
+    // [order - 1, sets - order + 1) and [sets - order + 1, sets). With fewer
+    // than 2 * (order - 1) sets these ranges overlap, or the unsigned
+    // subtraction wraps around, which would run the loops far out of range and
+    // read past the end of the hard-decision input. Reject such input.
+    if (conv->order < 1 || sets < 2 * (conv->order - 1)) {
+        return -1;
+    }
+
+    if (!conv->has_init_decode) {
+        // renormalize often enough that accumulated metrics cannot overflow
+        uint64_t max_error_per_input = conv->rate * soft_max;
+        unsigned int renormalize_interval = distance_max / max_error_per_input;
+        _convolutional_decode_init(conv, 5 * conv->order, 15 * conv->order, renormalize_interval);
+    }
+
+    // XXX fix this vvvvvv
+    // (the encoded byte count is used as the output capacity)
+    size_t decoded_len_bytes = num_encoded_bytes;
+    bit_writer_reconfigure(conv->bit_writer, msg, decoded_len_bytes);
+
+    error_buffer_reset(conv->errors);
+    history_buffer_reset(conv->history_buffer);
+
+    // no outputs are generated during warmup
+    convolutional_decode_warmup(conv, sets, soft_encoded);
+    convolutional_decode_inner(conv, sets, soft_encoded);
+    convolutional_decode_tail(conv, sets, soft_encoded);
+
+    // emit whatever is still held back in the history buffer
+    history_buffer_flush(conv->history_buffer, conv->bit_writer);
+
+    return bit_writer_length(conv->bit_writer);
+}
+
+// perform viterbi decoding
+// hard decoder
+// Hard-decision Viterbi decode of `num_encoded_bits` packed bits from
+// `encoded` into `msg`.
+// Returns the decoder's result, or -1 when the bit count is not a multiple of
+// the code rate.
+ssize_t correct_convolutional_decode(correct_convolutional *conv, const uint8_t *encoded,
+                                     size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->rate) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    // bytes needed to hold the bits, rounding a partial byte up
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8) {
+        num_encoded_bytes += 1;
+    }
+
+    bit_reader_reconfigure(conv->bit_reader, encoded, num_encoded_bytes);
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+// Soft-decision Viterbi decode: `encoded` holds one soft value per encoded
+// bit. Decoded bytes are written to `msg`.
+// Returns the decoder's result, or -1 when the bit count is not a multiple of
+// the code rate.
+ssize_t correct_convolutional_decode_soft(correct_convolutional *conv, const soft_t *encoded,
+                                          size_t num_encoded_bits, uint8_t *msg) {
+    if (num_encoded_bits % conv->rate) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    // byte count of the equivalent packed input, rounding a partial byte up
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8) {
+        num_encoded_bytes += 1;
+    }
+
+    return _convolutional_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
--- a/core/libcorrect/src/convolutional/encode.c
+++ b/core/libcorrect/src/convolutional/encode.c
@ -0,0 +1,61 @@
+#include "correct/convolutional/convolutional.h"
+
+// Returns the number of encoded BITS produced for a message of `msg_len`
+// bytes: `rate` output bits for every message bit plus order + 1 flush steps.
+size_t correct_convolutional_encode_len(correct_convolutional *conv, size_t msg_len) {
+    size_t input_steps = 8 * msg_len;
+    input_steps += conv->order + 1;
+    return conv->rate * input_steps;
+}
+
+// shift in most significant bit every time, one byte at a time
+// shift register takes most recent bit on right, shifts left
+// poly is written in same order, just & mask message w/ poly
+
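+// NOTE: the capacity of `encoded` is not checked here; the caller must provide
+// room for correct_convolutional_encode_len() bits, rounded up to whole bytes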
+// Encodes `msg_len` bytes of `msg` into `encoded` and returns the number of
+// encoded bits (not bytes). Each message bit is shifted into the register and
+// the table entry for the resulting state is written as `rate` output bits;
+// afterwards order + 1 zero bits are shifted in to flush the register, and the
+// last output byte is zero-padded.
+size_t correct_convolutional_encode(correct_convolutional *conv,
+                                    const uint8_t *msg,
+                                    size_t msg_len,
+                                    uint8_t *encoded) {
+    // convolutional code convolves filter coefficients, given by
+    //     the polynomial, with some history from our message.
+    //     the history is stored as single subsequent bits in shiftregister
+    shift_register_t shiftregister = 0;
+
+    // shiftmask is the shiftregister bit mask that removes bits
+    //      that extend beyond order
+    // e.g. if order is 7, then remove the 8th bit and beyond
+    unsigned int shiftmask = (1 << conv->order) - 1;
+
+    // output size in bits, and in bytes with a partial byte rounded up
+    size_t encoded_len_bits = correct_convolutional_encode_len(conv, msg_len);
+    size_t encoded_len = (encoded_len_bits % 8) ? (encoded_len_bits / 8 + 1) : (encoded_len_bits / 8);
+    bit_writer_reconfigure(conv->bit_writer, encoded, encoded_len);
+
+    bit_reader_reconfigure(conv->bit_reader, msg, msg_len);
+
+    for (size_t i = 0; i < 8 * msg_len; i++) {
+        // shiftregister has oldest bits on left, newest on right
+        shiftregister <<= 1;
+        shiftregister |= bit_reader_read(conv->bit_reader, 1);
+        shiftregister &= shiftmask;
+        // shift most significant bit from byte and move down one bit at a time
+
+        // we do direct lookup of our convolutional output here
+        // all of the bits from this convolution are stored in this row
+        unsigned int out = conv->table[shiftregister];
+        bit_writer_write(conv->bit_writer, out, conv->rate);
+    }
+
+    // now flush the shiftregister
+    // this is simply running the loop as above but without any new inputs
+    // or rather, the new input string is all 0s
+    for (size_t i = 0; i < conv->order + 1; i++) {
+        shiftregister <<= 1;
+        shiftregister &= shiftmask;
+        unsigned int out = conv->table[shiftregister];
+        bit_writer_write(conv->bit_writer, out, conv->rate);
+    }
+
+    // 0-fill any remaining bits on our final byte
+    bit_writer_flush_byte(conv->bit_writer);
+
+    return encoded_len_bits;
+}
--- a/core/libcorrect/src/convolutional/error_buffer.c
+++ b/core/libcorrect/src/convolutional/error_buffer.c
@ -0,0 +1,43 @@
+#include "correct/convolutional/error_buffer.h"
+
+error_buffer_t *error_buffer_create(unsigned int num_states) {
+    error_buffer_t *buf = calloc(1, sizeof(error_buffer_t));
+
+    // how large are the error buffers?
+    buf->num_states = num_states;
+
+    // save two error metrics, one for last round and one for this
+    // (double buffer)
+    // the error metric is the aggregated number of bit errors found
+    //   at a given path which terminates at a particular shift register state
+    buf->errors[0] = calloc(sizeof(distance_t), num_states);
+    buf->errors[1] = calloc(sizeof(distance_t), num_states);
+
+    // which buffer are we using, 0 or 1?
+    buf->index = 0;
+
+    buf->read_errors = buf->errors[0];
+    buf->write_errors = buf->errors[1];
+
+    return buf;
+}
+
+void error_buffer_destroy(error_buffer_t *buf) {
+    free(buf->errors[0]);
+    free(buf->errors[1]);
+    free(buf);
+}
+
+// Zeroes both metric buffers and restores the initial roles: buffer 0 is
+// read from, buffer 1 is written to.
+void error_buffer_reset(error_buffer_t *buf) {
+    const size_t nbytes = buf->num_states * sizeof(distance_t);
+    for (unsigned int which = 0; which < 2; which++) {
+        memset(buf->errors[which], 0, nbytes);
+    }
+
+    buf->index = 0;
+    buf->write_errors = buf->errors[1];
+    buf->read_errors = buf->errors[0];
+}
+
+// Exchanges the roles of the two metric buffers: the buffer that was just
+// written becomes the read buffer and the other one becomes the write buffer.
+//
+// `index` is treated as the index of the READ buffer, which matches the state
+// set up by create/reset (index 0, read = errors[0], write = errors[1]).
+// Selecting the read buffer with the old index, as before, made the first
+// swap after a reset a no-op, so the metrics of the first time slice were
+// overwritten instead of being carried forward.
+void error_buffer_swap(error_buffer_t *buf) {
+    buf->index = (buf->index + 1) % 2;
+    buf->read_errors = buf->errors[buf->index];
+    buf->write_errors = buf->errors[(buf->index + 1) % 2];
+}
--- a/core/libcorrect/src/convolutional/history_buffer.c
+++ b/core/libcorrect/src/convolutional/history_buffer.c
@ -0,0 +1,158 @@
+#include "correct/convolutional/history_buffer.h"
+
+// Allocates a history buffer holding min_traceback_length +
+// traceback_group_length time slices of `num_states` entries each, plus a
+// scratch array for bits collected during traceback.
+// Returns NULL if any allocation fails. Release with history_buffer_destroy().
+history_buffer *history_buffer_create(unsigned int min_traceback_length,
+                                      unsigned int traceback_group_length,
+                                      unsigned int renormalize_interval, unsigned int num_states,
+                                      shift_register_t highbit) {
+    history_buffer *buf = calloc(1, sizeof(history_buffer));
+    if (!buf) {
+        return NULL;
+    }
+
+    // these fields are written through casts once here and not reassigned
+    // anywhere else in this file
+    *(unsigned int *)&buf->min_traceback_length = min_traceback_length;
+    *(unsigned int *)&buf->traceback_group_length = traceback_group_length;
+    *(unsigned int *)&buf->cap = min_traceback_length + traceback_group_length;
+    *(unsigned int *)&buf->num_states = num_states;
+    *(shift_register_t *)&buf->highbit = highbit;
+
+    // zero the slice pointers so a partial failure can be unwound with free()
+    buf->history = calloc(buf->cap, sizeof(uint8_t *));
+    buf->fetched = malloc(buf->cap * sizeof(uint8_t));
+    if (!buf->history || !buf->fetched) {
+        goto fail;
+    }
+    for (unsigned int i = 0; i < buf->cap; i++) {
+        buf->history[i] = calloc(num_states, sizeof(uint8_t));
+        if (!buf->history[i]) {
+            goto fail;
+        }
+    }
+
+    buf->index = 0;
+    buf->len = 0;
+
+    buf->renormalize_counter = 0;
+    buf->renormalize_interval = renormalize_interval;
+
+    return buf;
+
+fail:
+    if (buf->history) {
+        for (unsigned int i = 0; i < buf->cap; i++) {
+            free(buf->history[i]);
+        }
+    }
+    free(buf->history);
+    free(buf->fetched);
+    free(buf);
+    return NULL;
+}
+
+// Frees every history slice, the slice table, the traceback scratch array and
+// finally the buffer itself.
+void history_buffer_destroy(history_buffer *buf) {
+    unsigned int slice = 0;
+    while (slice < buf->cap) {
+        free(buf->history[slice]);
+        slice++;
+    }
+    free(buf->fetched);
+    free(buf->history);
+    free(buf);
+}
+
+// Empties the buffer: no slices are held and writing restarts at slice 0.
+// The slice contents and the renormalize counter are left as they are.
+void history_buffer_reset(history_buffer *buf) {
+    buf->len = 0;
+    buf->index = 0;
+}
+
+// Returns the history slice at the current write index.
+uint8_t *history_buffer_get_slice(history_buffer *buf) { return buf->history[buf->index]; }
+
+// Returns the state with the smallest value in `distances`, examining every
+// `search_every`-th state starting at 0. On ties the lowest state wins.
+// If no examined value is below USHRT_MAX, state 0 is returned.
+shift_register_t history_buffer_search(history_buffer *buf, const distance_t *distances,
+                                       unsigned int search_every) {
+    // start from a valid state so the result is defined even when nothing
+    // beats the initial bound
+    shift_register_t bestpath = 0;
+    distance_t leasterror = USHRT_MAX;
+    // search for a state with the least error
+    for (shift_register_t state = 0; state < buf->num_states; state += search_every) {
+        if (distances[state] < leasterror) {
+            leasterror = distances[state];
+            bestpath = state;
+        }
+    }
+    return bestpath;
+}
+
+// Subtracts the metric of `min_register` from the first num_states entries of
+// `distances`, in place.
+void history_buffer_renormalize(history_buffer *buf, distance_t *distances,
+                                shift_register_t min_register) {
+    const distance_t baseline = distances[min_register];
+    shift_register_t state = 0;
+    while (state < buf->num_states) {
+        distances[state] -= baseline;
+        state++;
+    }
+}
+
+// Walks the stored history backwards from `bestpath`, starting at the slice
+// just before the current write index. The first `min_traceback_length` steps
+// only follow the path; the remaining steps (up to buf->len in total) also
+// collect one decoded bit each. The collected bits are written to `output` in
+// reverse order of collection, i.e. oldest first, and buf->len is reduced by
+// the number of bits written.
+void history_buffer_traceback(history_buffer *buf, shift_register_t bestpath,
+                              unsigned int min_traceback_length, bit_writer_t *output) {
+    unsigned int fetched_index = 0;
+    shift_register_t highbit = buf->highbit;
+    unsigned int index = buf->index;
+    unsigned int cap = buf->cap;
+    for (unsigned int j = 0; j < min_traceback_length; j++) {
+        // step back one slice, wrapping around the ring
+        if (index == 0) {
+            index = cap - 1;
+        } else {
+            index--;
+        }
+        // we're walking backwards from what the work we did before
+        // so, we'll shift high order bits in
+        // the path will cross multiple different shift register states, and we determine
+        //   which state by going backwards one time slice at a time
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+    }
+    // prefetch_index always runs one slice ahead (further back in time) of index
+    unsigned int prefetch_index = index;
+    if (prefetch_index == 0) {
+        prefetch_index = cap - 1;
+    } else {
+        prefetch_index--;
+    }
+    unsigned int len = buf->len;
+    for (unsigned int j = min_traceback_length; j < len; j++) {
+        index = prefetch_index;
+        if (prefetch_index == 0) {
+            prefetch_index = cap - 1;
+        } else {
+            prefetch_index--;
+        }
+        prefetch(buf->history[prefetch_index]);
+        // we're walking backwards from what the work we did before
+        // so, we'll shift high order bits in
+        // the path will cross multiple different shift register states, and we determine
+        //   which state by going backwards one time slice at a time
+        uint8_t history = buf->history[index][bestpath];
+        shift_register_t pathbit = history ? highbit : 0;
+        bestpath |= pathbit;
+        bestpath >>= 1;
+        // record the decoded bit; bits are collected newest first
+        buf->fetched[fetched_index] = (pathbit ? 1 : 0);
+        fetched_index++;
+    }
+    bit_writer_write_bitlist_reversed(output, buf->fetched, fetched_index);
+    buf->len -= fetched_index;
+}
+
+// Commits the slice that was just filled: advances the ring index, then
+// renormalizes the metrics and/or runs a traceback when either is due. Only
+// every `skip`-th state is considered when searching for the best path.
+void history_buffer_process_skip(history_buffer *buf, distance_t *distances, bit_writer_t *output,
+                                 unsigned int skip) {
+    // advance to the next slice, wrapping around the ring
+    buf->index++;
+    if (buf->index == buf->cap) {
+        buf->index = 0;
+    }
+
+    buf->renormalize_counter++;
+    buf->len++;
+
+    const int renormalize_due = (buf->renormalize_counter == buf->renormalize_interval);
+    const int traceback_due = (buf->len == buf->cap);
+
+    if (!renormalize_due && !traceback_due) {
+        // nothing to do this round
+        return;
+    }
+
+    // the best path search is expensive, so it runs once and the result is
+    // shared by renormalization and traceback when both are due
+    shift_register_t bestpath = history_buffer_search(buf, distances, skip);
+
+    if (renormalize_due) {
+        buf->renormalize_counter = 0;
+        history_buffer_renormalize(buf, distances, bestpath);
+    }
+    if (traceback_due) {
+        history_buffer_traceback(buf, bestpath, buf->min_traceback_length, output);
+    }
+}
+
+// Same as history_buffer_process_skip() with every state considered (skip = 1).
+void history_buffer_process(history_buffer *buf, distance_t *distances, bit_writer_t *output) {
+    history_buffer_process_skip(buf, distances, output, 1);
+}
+
+// Writes out every bit still held in the buffer by tracing back from state 0
+// with no minimum traceback length.
+void history_buffer_flush(history_buffer *buf, bit_writer_t *output) {
+    history_buffer_traceback(buf, 0, 0, output);
+}
--- a/core/libcorrect/src/convolutional/lookup.c
+++ b/core/libcorrect/src/convolutional/lookup.c
@ -0,0 +1,74 @@
+#include "correct/convolutional/lookup.h"
+
+// Fill the encoder output table, one entry per shift register state
+// (1 << order entries in total).
+// Each entry concatenates one parity bit per polynomial: bit j is set when
+// (state & poly[j]) has an odd number of set bits.
+// e.g. for rate 2, each entry holds 2 bits
+// The first poly lands in the LEAST significant bit, the last poly in the
+// most significant one.
+void fill_table(unsigned int rate,
+                unsigned int order,
+                const polynomial_t *poly,
+                unsigned int *table) {
+    const int num_states = 1 << order;
+    for (shift_register_t state = 0; state < num_states; state++) {
+        unsigned int packed = 0;
+        unsigned int bit = 1;
+        size_t p = 0;
+        while (p < rate) {
+            // odd parity of the tapped register bits -> output bit is 1
+            if (popcount(state & poly[p]) % 2) {
+                packed |= bit;
+            }
+            bit <<= 1;
+            p++;
+        }
+        table[state] = packed;
+    }
+}
+
+// Build a lookup that maps each pair of adjacent shift register states
+// (2*i and 2*i + 1) to a small opaque key identifying their concatenated
+// encoder outputs.
+//
+// rate  - number of output bits per table entry
+// order - the table is read for states 0 .. (1 << order) - 1
+// table - encoder output table, as produced by fill_table
+//
+// Key 0 is never handed out: keys start at 1 so that 0 can mean "not seen
+// yet" in the temporary reverse map. The returned struct owns keys, outputs
+// and distances; release them with pair_lookup_destroy.
+pair_lookup_t pair_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    pair_lookup_t lookup;
+
+    lookup.keys = malloc(sizeof(unsigned int) * (1 << (order - 1)));
+    lookup.outputs = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    // reverse map: concatenated output -> key (0 while unassigned)
+    unsigned int *key_of_output = calloc((1 << (rate * 2)), sizeof(unsigned int));
+    unsigned int next_key = 1;
+
+    for (unsigned int state = 0; state < (1 << (order - 1)); state++) {
+        // the odd state's output goes in the high bits, the even state's in the low bits
+        const unsigned int even_out = table[state * 2];
+        const unsigned int odd_out = table[state * 2 + 1];
+        const unsigned int concat = (odd_out << rate) | even_out;
+
+        unsigned int key = key_of_output[concat];
+        if (key == 0) {
+            // first time this output pair shows up: give it the next free key
+            key = next_key;
+            key_of_output[concat] = key;
+            lookup.outputs[key] = concat;
+            next_key++;
+        }
+        lookup.keys[state] = key;
+    }
+
+    lookup.outputs_len = next_key;
+    lookup.output_mask = (1 << (rate)) - 1;
+    lookup.output_width = rate;
+    lookup.distances = calloc(lookup.outputs_len, sizeof(distance_pair_t));
+    free(key_of_output);
+    return lookup;
+}
+
+// Release the three arrays allocated by pair_lookup_create.
+// The struct is passed by value, so the caller's copy keeps its (now
+// dangling) pointers and must not be used afterwards.
+void pair_lookup_destroy(pair_lookup_t pairs) {
+    free(pairs.distances);
+    free(pairs.outputs);
+    free(pairs.keys);
+}
+
+// Refresh the per-key distance pairs from the per-output distance table.
+//
+// pairs     - lookup built by pair_lookup_create; pairs.distances[key] is
+//             overwritten for every assigned key
+// distances - distance for each possible single encoder output, indexed by
+//             that output
+//
+// Each entry packs two distances: the one for the low output of the pair in
+// bits 0..15 and the one for the high output in bits 16 and up.
+void pair_lookup_fill_distance(pair_lookup_t pairs, distance_t *distances) {
+    // key 0 is never assigned by pair_lookup_create, so start at 1
+    for (unsigned int i = 1; i < pairs.outputs_len; i += 1) {
+        output_pair_t concat_out = pairs.outputs[i];
+        unsigned int i_0 = concat_out & pairs.output_mask;
+        concat_out >>= pairs.output_width;
+        unsigned int i_1 = concat_out;
+
+        // Widen before shifting: a distance narrower than int is promoted to
+        // signed int, and shifting a value >= 0x8000 left by 16 would then
+        // shift into the sign bit, which is undefined behavior.
+        pairs.distances[i] = ((distance_pair_t)distances[i_1] << 16) | distances[i_0];
+    }
+}
--- a/core/libcorrect/src/convolutional/metric.c
+++ b/core/libcorrect/src/convolutional/metric.c
@ -0,0 +1,17 @@
+#include "correct/convolutional/metric.h"
+
+// Measure the square of the euclidean distance between x and y, scaled down
+// by 8 (>> 3).
+// Since euclidean dist is sqrt(a^2 + b^2 + ... + n^2), the square is just
+//    a^2 + b^2 + ... + n^2
+//
+// hard_x - hard bits to compare against, one per soft value, starting at the
+//          least significant bit
+// soft_y - len soft values in 0..255
+// len    - number of bits / soft values to compare
+//
+// Returns the scaled sum of squared differences, converted to distance_t.
+distance_t metric_soft_distance_quadratic(unsigned int hard_x, const uint8_t *soft_y, size_t len) {
+    // Accumulate in a wide type: a single squared difference can be as large
+    // as 255 * 255 = 65025, so two of them already exceed 16 bits. Summing
+    // directly into a narrow distance_t would wrap before the down-scaling
+    // shift is applied.
+    uint64_t sum = 0;
+    for (size_t i = 0; i < len; i++) {
+        // first, convert hard_x to a soft measurement (0 -> 0, 1 -> 255)
+        int soft_x = (hard_x & 1) ? 255 : 0;
+        hard_x >>= 1;
+        // signed difference, so no unsigned wrap-around is involved
+        int d = (int)soft_y[i] - soft_x;
+        sum += (uint64_t)(d * d);
+    }
+    return (distance_t)(sum >> 3);
+}
+
--- a/core/libcorrect/src/convolutional/sse/CMakeLists.txt
+++ b/core/libcorrect/src/convolutional/sse/CMakeLists.txt
@ -0,0 +1,2 @@
+# Sources of the SSE variant of the convolutional codec, compiled as an
+# OBJECT library (object files only, no archive of its own).
+set(SRCFILES lookup.c convolutional.c encode.c decode.c)
+add_library(correct-convolutional-sse OBJECT ${SRCFILES})
--- a/core/libcorrect/src/convolutional/sse/convolutional.c
+++ b/core/libcorrect/src/convolutional/sse/convolutional.c
@ -0,0 +1,21 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// Allocate and initialize an SSE convolutional codec.
+//
+// rate, order, poly - forwarded to _correct_convolutional_init
+//
+// Returns the new codec, or NULL if the allocation fails or the base
+// initialization rejects the parameters. Release the result with
+// correct_convolutional_sse_destroy.
+correct_convolutional_sse *correct_convolutional_sse_create(size_t rate,
+                                                            size_t order,
+                                                            const polynomial_t *poly) {
+    correct_convolutional_sse *conv = malloc(sizeof *conv);
+    if (!conv) {
+        // out of memory: do not hand a NULL-based pointer to the initializer
+        return NULL;
+    }
+
+    if (!_correct_convolutional_init(&conv->base_conv, rate, order, poly)) {
+        free(conv);
+        return NULL;
+    }
+    return conv;
+}
+
+// Tear down and free a codec created by correct_convolutional_sse_create.
+// Passing NULL is a no-op, mirroring free().
+void correct_convolutional_sse_destroy(correct_convolutional_sse *conv) {
+    if (!conv) {
+        return;
+    }
+    // the oct lookup only exists once decoding has been initialized
+    if (conv->base_conv.has_init_decode) {
+        oct_lookup_destroy(conv->oct_lookup);
+    }
+    _correct_convolutional_teardown(&conv->base_conv);
+    free(conv);
+}
--- a/core/libcorrect/src/convolutional/sse/decode.c
+++ b/core/libcorrect/src/convolutional/sse/decode.c
@ -0,0 +1,319 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// Main decode loop of the SSE decoder, covering the time slices
+// [order - 1, sets - order + 1).
+//
+// For every time slice it
+//   1. fills conv->distances with the distance between each possible
+//      rate-bit encoder output and the received input,
+//   2. refreshes the oct lookup distances from that table,
+//   3. for each pair of register states that differ only in their high order
+//      bit, adds the distance for this slice to the previous error, keeps the
+//      smaller sum in write_errors and records in the history row whether the
+//      high-bit-set state won,
+//   4. advances the history buffer and swaps the error buffers.
+//
+// sse_conv - decoder state; base_conv's distances, errors, history buffer and
+//            bit writer are all updated
+// sets     - total number of input sets in the message
+// soft     - soft input (rate values per set), or NULL to read hard bits from
+//            conv->bit_reader
+//
+// The history buffer's index, len and renormalize_counter are mirrored in
+// locals so that history_buffer_process is only called on slices where a
+// traceback or renormalize is actually due; the locals are written back
+// before returning.
+static void convolutional_sse_decode_inner(correct_convolutional_sse *sse_conv, unsigned int sets,
+                                           const uint8_t *soft) {
+    correct_convolutional *conv = &sse_conv->base_conv;
+    // the high order bit of the shift register
+    shift_register_t highbit = 1 << (conv->order - 1);
+    // local mirrors of the history buffer bookkeeping (see the end of the loop)
+    unsigned int hist_buf_index = conv->history_buffer->index;
+    unsigned int hist_buf_cap = conv->history_buffer->cap;
+    unsigned int hist_buf_len = conv->history_buffer->len;
+    unsigned int hist_buf_rn_int = conv->history_buffer->renormalize_interval;
+    unsigned int hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+    for (unsigned int i = conv->order - 1; i < (sets - conv->order + 1); i++) {
+        distance_t *distances = conv->distances;
+        // fill distances[] with the distance between every possible rate-bit
+        // encoder output and the input received for this time slice
+        if (soft) {
+            if (conv->soft_measurement == CORRECT_SOFT_LINEAR) {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_linear(j, soft + i * conv->rate, conv->rate);
+                }
+            } else {
+                for (unsigned int j = 0; j < 1 << (conv->rate); j++) {
+                    distances[j] =
+                        metric_soft_distance_quadratic(j, soft + i * conv->rate, conv->rate);
+                }
+            }
+        } else {
+            unsigned int out = bit_reader_read(conv->bit_reader, conv->rate);
+            // note: this `i` shadows the time slice index of the enclosing loop
+            for (unsigned int i = 0; i < 1 << (conv->rate); i++) {
+                distances[i] = metric_distance(i, out);
+            }
+        }
+        oct_lookup_t oct_lookup = sse_conv->oct_lookup;
+        oct_lookup_fill_distance(oct_lookup, distances);
+
+        // total number of shift register states (twice highbit); used as the
+        // upper bound for `high` in the state loop below
+        unsigned int num_iter = highbit << 1;
+        // aggregate errors from the previous time slice
+        const distance_t *read_errors = conv->errors->read_errors;
+        // aggregate bit errors for this time slice
+        distance_t *write_errors = conv->errors->write_errors;
+
+        // history row for this time slice
+        uint8_t *history = conv->history_buffer->history[hist_buf_index];
+        ;
+        // walk through all states, ignoring oldest bit
+        // this loop considers two paths per resulting state (high order bit
+        // set, clear)
+        // we'll update the history for every state and store the least
+        // aggregated error of each pair
+
+        // now run the main loop
+        // this creates 2 sets which share a predecessor, and 2 sets which share
+        // a successor
+        //
+        // the first set definition is the two states that are the same except
+        // for the least order bit
+        // these two share a predecessor because their high n - 1 bits are the
+        // same (differ only by newest bit)
+        //
+        // the second set definition is the two states that are the same except
+        // for the high order bit
+        // these two share a successor because the oldest high order bit will be
+        // shifted out, and the other bits will be present in the successor
+        //
+        // offsets of the "high order bit set" half within read_errors and
+        // within the oct lookup keys
+        shift_register_t highbase = highbit >> 1;
+        shift_register_t oct_highbase = highbase >> 2;
+        // per iteration: 32 entries of write_errors/history (low), 16 entries
+        // of read_errors (base) and 4 oct keys (oct)
+        for (shift_register_t low = 0, high = highbit, base = 0, oct = 0; high < num_iter;
+             low += 32, high += 32, base += 16, oct += 4) {
+            // shifted-right ancestors
+            // low and low_plus_one share low_past_error
+            //   note that they are the same when shifted right by 1
+            // same goes for high and high_plus_one
+            // byte shuffle that duplicates each of the four loaded 16-bit
+            // errors into two adjacent lanes
+            __m128i past_shuffle_mask =
+                _mm_set_epi32(0x07060706, 0x05040504, 0x03020302, 0x01000100);
+            // byte shuffle that keeps one byte of each 16-bit compare result
+            // in the low 8 bytes and zeroes the high 8 bytes (0x80 selectors)
+            __m128i hist_mask =
+                _mm_set_epi32(0x80808080, 0x80808080, 0x0e0c0a09, 0x07050301);
+
+            // the loop below calculates 64 register states per loop iteration
+            // it does this by packing the 128-bit xmm registers with 8, 16-bit
+            // distances
+            // 4 of these registers hold distances for convolutional shift
+            // register states with the high bit cleared
+            //      and 4 hold distances for the corresponding shift register
+            //      states with the high bit set
+            // since each xmm register holds 8 distances, this adds up to a
+            // total of 8 * 8 = 64 shift register states
+            // note: with base_offset starting at 0, stepping by 16 and bounded
+            // by 16, this loop body runs exactly once per outer iteration
+            for (shift_register_t offset = 0, base_offset = 0; base_offset < 16;
+                 offset += 32, base_offset += 16) {
+                // load the past error for the register states with the high
+                // order bit cleared (4 x 16-bit errors per load)
+                __m128i low_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset));
+                __m128i low_past_error0 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 4));
+                __m128i low_past_error1 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 8));
+                __m128i low_past_error2 =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + base + base_offset + 12));
+
+                // shuffle the low past error
+                // register states that differ only by their low order bit share
+                // a past error
+                low_past_error = _mm_shuffle_epi8(low_past_error, past_shuffle_mask);
+                low_past_error0 = _mm_shuffle_epi8(low_past_error0, past_shuffle_mask);
+                low_past_error1 = _mm_shuffle_epi8(low_past_error1, past_shuffle_mask);
+                low_past_error2 = _mm_shuffle_epi8(low_past_error2, past_shuffle_mask);
+
+                // repeat past error lookup for register states with high order
+                // bit set
+                __m128i high_past_error =
+                    _mm_loadl_epi64((const __m128i *)(read_errors + highbase + base + base_offset));
+                __m128i high_past_error0 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 4));
+                __m128i high_past_error1 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 8));
+                __m128i high_past_error2 = _mm_loadl_epi64(
+                    (const __m128i *)(read_errors + highbase + base + base_offset + 12));
+
+                high_past_error = _mm_shuffle_epi8(high_past_error, past_shuffle_mask);
+                high_past_error0 = _mm_shuffle_epi8(high_past_error0, past_shuffle_mask);
+                high_past_error1 = _mm_shuffle_epi8(high_past_error1, past_shuffle_mask);
+                high_past_error2 = _mm_shuffle_epi8(high_past_error2, past_shuffle_mask);
+
+                // __m128i this_shuffle_mask = (__m128i){0x80800100, 0x80800302,
+                // 0x80800504, 0x80800706};
+
+                // load the opaque oct distance table keys from our loop index
+                distance_oct_key_t low_key = oct_lookup.keys[oct + (base_offset / 4)];
+                distance_oct_key_t low_key0 = oct_lookup.keys[oct + (base_offset / 4) + 1];
+                distance_oct_key_t low_key1 = oct_lookup.keys[oct + (base_offset / 4) + 2];
+                distance_oct_key_t low_key2 = oct_lookup.keys[oct + (base_offset / 4) + 3];
+
+                // load the distances for the register states with high order
+                // bit cleared
+                // (aligned 128-bit loads: oct_lookup.distances + key must be
+                // 16-byte aligned)
+                __m128i low_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key));
+                __m128i low_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key0));
+                __m128i low_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key1));
+                __m128i low_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + low_key2));
+
+                // add the distance for this time slice to the past distances
+                __m128i low_error = _mm_add_epi16(low_past_error, low_this_error);
+                __m128i low_error0 = _mm_add_epi16(low_past_error0, low_this_error0);
+                __m128i low_error1 = _mm_add_epi16(low_past_error1, low_this_error1);
+                __m128i low_error2 = _mm_add_epi16(low_past_error2, low_this_error2);
+
+                // repeat oct distance table lookup for registers with high
+                // order bit set
+                distance_oct_key_t high_key =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4)];
+                distance_oct_key_t high_key0 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 1];
+                distance_oct_key_t high_key1 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 2];
+                distance_oct_key_t high_key2 =
+                    oct_lookup.keys[oct_highbase + oct + (base_offset / 4) + 3];
+
+                __m128i high_this_error =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key));
+                __m128i high_this_error0 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key0));
+                __m128i high_this_error1 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key1));
+                __m128i high_this_error2 =
+                    _mm_load_si128((const __m128i *)(oct_lookup.distances + high_key2));
+
+                __m128i high_error = _mm_add_epi16(high_past_error, high_this_error);
+                __m128i high_error0 = _mm_add_epi16(high_past_error0, high_this_error0);
+                __m128i high_error1 = _mm_add_epi16(high_past_error1, high_this_error1);
+                __m128i high_error2 = _mm_add_epi16(high_past_error2, high_this_error2);
+
+                // distances for this time slice calculated
+
+                // find the least error between registers who differ only in
+                // their high order bit
+                __m128i min_error = _mm_min_epu16(low_error, high_error);
+                __m128i min_error0 = _mm_min_epu16(low_error0, high_error0);
+                __m128i min_error1 = _mm_min_epu16(low_error1, high_error1);
+                __m128i min_error2 = _mm_min_epu16(low_error2, high_error2);
+
+                // write the least error so that the next time slice sees it as
+                // the past error
+                _mm_store_si128((__m128i *)(write_errors + low + offset), min_error);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 8), min_error0);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 16), min_error1);
+                _mm_store_si128((__m128i *)(write_errors + low + offset + 24), min_error2);
+
+                // generate history bits as (low_error > least_error)
+                // this operation fills each element with all 1s if true and 0s
+                // if false
+                // in other words, we set the history bit to 1 if
+                //      the register state with high order bit set was the least
+                //      error
+                // note: the minimum above is unsigned while this compare is
+                // signed, so the two only agree while errors stay below 0x8000
+                __m128i hist = _mm_cmpgt_epi16(low_error, min_error);
+                // pack the bits down from 16-bit wide to 8-bit wide to
+                // accommodate history table
+                hist = _mm_shuffle_epi8(hist, hist_mask);
+
+                __m128i hist0 = _mm_cmpgt_epi16(low_error0, min_error0);
+                hist0 = _mm_shuffle_epi8(hist0, hist_mask);
+
+                __m128i hist1 = _mm_cmpgt_epi16(low_error1, min_error1);
+                hist1 = _mm_shuffle_epi8(hist1, hist_mask);
+
+                __m128i hist2 = _mm_cmpgt_epi16(low_error2, min_error2);
+                hist2 = _mm_shuffle_epi8(hist2, hist_mask);
+
+                // store the history bits set by cmp and shuffle operations
+                // (8 one-byte entries per store)
+                _mm_storel_epi64((__m128i *)(history + low + offset), hist);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 8), hist0);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 16), hist1);
+                _mm_storel_epi64((__m128i *)(history + low + offset + 24), hist2);
+            }
+        }
+
+        // bypass the call to history buffer
+        // we should really make that function inline and remove this below
+        // the call is only needed when this slice fills the buffer (traceback)
+        // or reaches the renormalize interval; otherwise the bookkeeping is
+        // replicated on the local mirrors
+        if (hist_buf_len == hist_buf_cap - 1 || hist_buf_rn_cnt == hist_buf_rn_int - 1) {
+            // restore hist buffer state and invoke it
+            conv->history_buffer->len = hist_buf_len;
+            conv->history_buffer->index = hist_buf_index;
+            conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+            history_buffer_process(conv->history_buffer, write_errors, conv->bit_writer);
+            // restore our local values
+            hist_buf_len = conv->history_buffer->len;
+            hist_buf_index = conv->history_buffer->index;
+            hist_buf_cap = conv->history_buffer->cap;
+            hist_buf_rn_cnt = conv->history_buffer->renormalize_counter;
+        } else {
+            hist_buf_len++;
+            hist_buf_index++;
+            if (hist_buf_index == hist_buf_cap) {
+                hist_buf_index = 0;
+            }
+            hist_buf_rn_cnt++;
+        }
+        // this slice's write errors become the next slice's read errors
+        error_buffer_swap(conv->errors);
+    }
+    // publish the local mirrors back to the history buffer
+    conv->history_buffer->len = hist_buf_len;
+    conv->history_buffer->index = hist_buf_index;
+    conv->history_buffer->renormalize_counter = hist_buf_rn_cnt;
+}
+
+// Set up decoder state: run the base decoder initialization with the given
+// traceback / renormalize parameters, then build the oct lookup from the base
+// codec's rate, order and output table.
+static void _convolutional_sse_decode_init(correct_convolutional_sse *conv,
+                                           unsigned int min_traceback,
+                                           unsigned int traceback_length,
+                                           unsigned int renormalize_interval) {
+    correct_convolutional *base = &conv->base_conv;
+
+    _convolutional_decode_init(base, min_traceback, traceback_length, renormalize_interval);
+    conv->oct_lookup = oct_lookup_create(base->rate, base->order, base->table);
+}
+
+// Shared implementation behind the hard and soft decode entry points.
+//
+// sse_conv          - decoder; decode state is initialized on first use
+// num_encoded_bits  - number of encoded bits in the input
+// num_encoded_bytes - byte count used as the length of msg for the bit writer
+// msg               - destination for the decoded bytes
+// soft_encoded      - soft input, or NULL when decoding hard bits that were
+//                     already handed to the bit reader
+//
+// Returns the value of bit_writer_length() after the final flush.
+static ssize_t _convolutional_sse_decode(correct_convolutional_sse *sse_conv,
+                                         size_t num_encoded_bits, size_t num_encoded_bytes,
+                                         uint8_t *msg, const soft_t *soft_encoded) {
+    correct_convolutional *base = &sse_conv->base_conv;
+
+    // lazily initialize the decoder on the first call
+    if (!base->has_init_decode) {
+        const uint64_t worst_error_per_input = base->rate * soft_max;
+        // sse implementation unfortunately uses signed math on our unsigned values
+        // reduces usable distance by /2
+        const unsigned int renorm_every = (distance_max / 2) / worst_error_per_input;
+        _convolutional_sse_decode_init(sse_conv, 5 * base->order, 100 * base->order,
+                                       renorm_every);
+    }
+
+    const size_t sets = num_encoded_bits / base->rate;
+
+    // XXX fix this: the output length is simply taken to be the encoded byte count
+    bit_writer_reconfigure(base->bit_writer, msg, num_encoded_bytes);
+
+    // start from clean error and history state for every message
+    error_buffer_reset(base->errors);
+    history_buffer_reset(base->history_buffer);
+
+    // three phases over the input; no outputs are generated during warmup
+    convolutional_decode_warmup(base, sets, soft_encoded);
+    convolutional_sse_decode_inner(sse_conv, sets, soft_encoded);
+    convolutional_decode_tail(base, sets, soft_encoded);
+
+    // emit whatever is still held in the history buffer
+    history_buffer_flush(base->history_buffer, base->bit_writer);
+
+    return bit_writer_length(base->bit_writer);
+}
+
+// Decode hard (packed bit) input.
+//
+// conv             - SSE decoder
+// encoded          - packed encoded bits
+// num_encoded_bits - number of valid bits in encoded; must be a multiple of
+//                    the code rate
+// msg              - destination for the decoded bytes
+//
+// Returns the result of the shared decode routine, or -1 when
+// num_encoded_bits is not a multiple of the rate.
+ssize_t correct_convolutional_sse_decode(correct_convolutional_sse *conv, const uint8_t *encoded,
+                                         size_t num_encoded_bits, uint8_t *msg) {
+    const size_t leftover_bits = num_encoded_bits % conv->base_conv.rate;
+    if (leftover_bits != 0) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    // whole bytes, plus one more for a trailing partial byte
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8) {
+        num_encoded_bytes += 1;
+    }
+
+    bit_reader_reconfigure(conv->base_conv.bit_reader, encoded, num_encoded_bytes);
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, NULL);
+}
+
+// Decode soft input.
+//
+// conv             - SSE decoder
+// encoded          - soft values, passed straight to the shared decode routine
+// num_encoded_bits - number of encoded bits; must be a multiple of the code
+//                    rate
+// msg              - destination for the decoded bytes
+//
+// Returns the result of the shared decode routine, or -1 when
+// num_encoded_bits is not a multiple of the rate. The bit reader is not
+// touched on this path.
+ssize_t correct_convolutional_sse_decode_soft(correct_convolutional_sse *conv, const soft_t *encoded,
+                                              size_t num_encoded_bits, uint8_t *msg) {
+    const size_t leftover_bits = num_encoded_bits % conv->base_conv.rate;
+    if (leftover_bits != 0) {
+        // XXX turn this into an error code
+        // printf("encoded length of message must be a multiple of rate\n");
+        return -1;
+    }
+
+    // byte count of the equivalent packed input, rounded up
+    size_t num_encoded_bytes = num_encoded_bits / 8;
+    if (num_encoded_bits % 8) {
+        num_encoded_bytes += 1;
+    }
+
+    return _convolutional_sse_decode(conv, num_encoded_bits, num_encoded_bytes, msg, encoded);
+}
--- a/core/libcorrect/src/convolutional/sse/encode.c
+++ b/core/libcorrect/src/convolutional/sse/encode.c
@ -0,0 +1,9 @@
+#include "correct/convolutional/sse/convolutional.h"
+
+// Encoded length for a message of msg_len, as reported by the base codec's
+// correct_convolutional_encode_len.
+size_t correct_convolutional_sse_encode_len(correct_convolutional_sse *conv, size_t msg_len) {
+    correct_convolutional *base = &conv->base_conv;
+    return correct_convolutional_encode_len(base, msg_len);
+}
+
+// Encode msg into encoded by delegating to the base codec's
+// correct_convolutional_encode; its return value is passed through.
+size_t correct_convolutional_sse_encode(correct_convolutional_sse *conv, const uint8_t *msg, size_t msg_len, uint8_t *encoded) {
+    correct_convolutional *base = &conv->base_conv;
+    return correct_convolutional_encode(base, msg, msg_len, encoded);
+}
--- a/core/libcorrect/src/convolutional/sse/lookup.c
+++ b/core/libcorrect/src/convolutional/sse/lookup.c
@ -0,0 +1,183 @@
+#include "correct/convolutional/sse/lookup.h"
+
+// Build a lookup that maps each group of four consecutive shift register
+// states (4*i .. 4*i + 3) to a small opaque key identifying their
+// concatenated encoder outputs.
+//
+// rate  - number of output bits per table entry
+// order - the table is read for states 0 .. (1 << order) - 1
+// table - encoder output table
+//
+// Key 0 is never handed out: keys start at 1 so that 0 can mean "not seen
+// yet" in the temporary reverse map. The returned struct owns keys, outputs
+// and distances; release them with quad_lookup_destroy.
+quad_lookup_t quad_lookup_create(unsigned int rate,
+                                 unsigned int order,
+                                 const unsigned int *table) {
+    quad_lookup_t lookup;
+
+    lookup.keys = malloc(sizeof(unsigned int) * (1 << (order - 2)));
+    lookup.outputs = calloc((1 << (rate * 4)), sizeof(unsigned int));
+    // reverse map: concatenated output -> key (0 while unassigned)
+    unsigned int *key_of_output = calloc((1 << (rate * 4)), sizeof(unsigned int));
+    unsigned int next_key = 1;
+
+    for (unsigned int group = 0; group < (1 << (order - 2)); group++) {
+        // concatenate the four outputs, state 4*group + 3 in the highest bits
+        // and state 4*group in the lowest
+        unsigned int concat = table[group * 4 + 3];
+        for (int member = 2; member >= 0; member--) {
+            concat <<= rate;
+            concat |= table[group * 4 + member];
+        }
+
+        unsigned int key = key_of_output[concat];
+        if (key == 0) {
+            // first time this output quad shows up: give it the next free key
+            key = next_key;
+            key_of_output[concat] = key;
+            lookup.outputs[key] = concat;
+            next_key++;
+        }
+        lookup.keys[group] = key;
+    }
+
+    lookup.outputs_len = next_key;
+    lookup.output_mask = (1 << (rate)) - 1;
+    lookup.output_width = rate;
+    lookup.distances = calloc(lookup.outputs_len, sizeof(distance_quad_t));
+    free(key_of_output);
+    return lookup;
+}
+
+// Release the three arrays allocated by quad_lookup_create.
+// The struct is passed by value, so the caller's copy keeps its (now
+// dangling) pointers and must not be used afterwards.
+void quad_lookup_destroy(quad_lookup_t quads) {
+    free(quads.distances);
+    free(quads.outputs);
+    free(quads.keys);
+}
+
+// Refresh the per-key distance quads from the per-output distance table.
+//
+// quads     - lookup built by quad_lookup_create; quads.distances[key] is
+//             overwritten for every assigned key
+// distances - distance for each possible single encoder output, indexed by
+//             that output
+//
+// Each entry packs four distances at 16-bit strides; the distance for the
+// lowest output of the quad occupies bits 0..15.
+void quad_lookup_fill_distance(quad_lookup_t quads, distance_t *distances) {
+    // key 0 is never assigned by quad_lookup_create, so start at 1
+    for (unsigned int i = 1; i < quads.outputs_len; i += 1) {
+        output_quad_t concat_out = quads.outputs[i];
+        unsigned int i_0 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_1 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_2 = concat_out & quads.output_mask;
+        concat_out >>= quads.output_width;
+        unsigned int i_3 = concat_out;
+
+        // Every shifted term is widened to 64 bits first. Left as a promoted
+        // signed int, (distances[i_1] << 16) turns negative once the distance
+        // reaches 0x8000, and the sign extension during the conversion to
+        // 64 bits would then set all of the upper 32 bits, clobbering the
+        // i_2 and i_3 fields.
+        quads.distances[i] = ((uint64_t)distances[i_3] << 48) |
+                             ((uint64_t)distances[i_2] << 32) |
+                             ((uint64_t)distances[i_1] << 16) |
+                             (uint64_t)distances[i_0];
+    }
+}
+
+// Linear search for out among outputs[1 .. num_keys - 1].
+// Returns the index of the first match, or 0 when there is none (slot 0 is
+// never examined, so 0 doubles as "not found").
+distance_oct_key_t oct_lookup_find_key(output_oct_t *outputs, output_oct_t out, size_t num_keys) {
+    size_t slot = 1;
+    while (slot < num_keys) {
+        if (outputs[slot] == out) {
+            return slot;
+        }
+        slot++;
+    }
+    return 0;
+}
+
+// Build a lookup that maps each group of eight consecutive shift register
+// states (8*i .. 8*i + 7) to an opaque key identifying their eight encoder
+// outputs.
+//
+// rate  - number of output bits per table entry
+// order - the table is read for states 0 .. (1 << order) - 1
+// table - encoder output table
+//
+// Two forms of each group's outputs are built: a compact one (rate bits per
+// output) used only to detect duplicates while building, and an expanded one
+// (8 bits per output) that is stored in outputs[]. Stored keys are the
+// assigned key number times 2, since the distances array is strided by 2.
+// The returned struct owns keys, outputs and distances; release them with
+// oct_lookup_destroy.
+oct_lookup_t oct_lookup_create(unsigned int rate,
+                               unsigned int order,
+                               const unsigned int *table) {
+    oct_lookup_t lookup;
+
+    lookup.keys = malloc((1 << (order - 3)) * sizeof(distance_oct_key_t));
+    lookup.outputs = malloc(((output_oct_t)2 << rate) * sizeof(uint64_t));
+    // compact outputs seen so far, indexed by key number (slot 0 unused)
+    output_oct_t *seen_compact = calloc(((output_oct_t)2 << rate), sizeof(output_oct_t));
+    size_t capacity = 2 << rate;
+    unsigned int next_key = 1;
+
+    for (shift_register_t group = 0; group < (1 << (order - 3)); group++) {
+        const unsigned int *octet = table + group * 8;
+
+        // compact form: eight outputs at `rate` bits each, octet[7] highest
+        output_oct_t compact = 0;
+        for (int member = 7; member >= 0; member--) {
+            compact <<= rate;
+            compact |= octet[member];
+        }
+
+        distance_oct_key_t key = oct_lookup_find_key(seen_compact, compact, next_key);
+        if (!key) {
+            // not seen before: build the expanded form, one byte per output
+            output_oct_t expanded = 0;
+            for (int member = 7; member >= 0; member--) {
+                expanded <<= 8;
+                expanded |= octet[member];
+            }
+
+            // grow both arrays when they are full
+            if (next_key == capacity) {
+                lookup.outputs = realloc(lookup.outputs, capacity * 2 * sizeof(output_oct_t));
+                seen_compact = realloc(seen_compact, capacity * 2 * sizeof(output_oct_t));
+                capacity *= 2;
+            }
+            seen_compact[next_key] = compact;
+            lookup.outputs[next_key] = expanded;
+            key = next_key;
+            next_key++;
+        }
+        // we multiply the key by 2 since the distances are strided by 2
+        lookup.keys[group] = key * 2;
+    }
+    free(seen_compact);
+
+    lookup.outputs_len = next_key;
+    lookup.output_mask = (1 << (rate)) - 1;
+    lookup.output_width = rate;
+    lookup.distances = malloc(lookup.outputs_len * 2 * sizeof(uint64_t));
+    return lookup;
+}
+
+// Release the three arrays allocated by oct_lookup_create.
+// The struct is passed by value, so the caller's copy keeps its (now
+// dangling) pointers and must not be used afterwards.
+void oct_lookup_destroy(oct_lookup_t octs) {
+    free(octs.distances);
+    free(octs.outputs);
+    free(octs.keys);
+}
+
+// WIP: sse approach to filling the distance table
+/*
+void oct_lookup_fill_distance_sse(oct_lookup_t octs, distance_t *distances) {
+    distance_pair_t *distance_pair = (distance_pair_t*)octs.distances;
+    __v4si index_shuffle_mask = (__v4si){0xffffff00, 0xffffff01, 0xffffff02, 0xffffff03};
+    __m256i dist_shuffle_mask = (__m256i){0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff,
+                                          0x01000504, 0x09080d0c, 0xffffffff, 0xffffffff};
+    const int dist_permute_mask = 0x0c;
+    for (unsigned int i = 1; i < octs.outputs_len; i += 2) {
+        // big heaping todo vvv
+        // a) we want 16 bit distances GATHERed, not 32 bit
+        // b) we need to load 8 of those distances, not 4
+        __v4si short_concat_index = _mm_loadl_epi64(octs.outputs + 2*i);
+        __v4si short_concat_index0 = _mm_loadl_epi64(octs.outputs + 2*i + 1);
+        __m256i concat_index = _mm256_cvtepu8_epi32(short_concat_index);
+        __m256i concat_index0 = _mm256_cvtepu8_epi32(short_concat_index0);
+        __m256i dist = _mm256_i32gather_epi32(distances, concat_index, sizeof(distance_t));
+        __m256i dist0 = _mm256_i32gather_epi32(distances, concat_index0, sizeof(distance_t));
+        dist = _mm256_shuffle_epi8(dist, dist_shuffle_mask);
+        dist0 = _mm256_shuffle_epi8(dist0, dist_shuffle_mask);
+        dist = __builtin_shufflevector(dist, dist, 0, 5, 0, 0);
+        dist0 = __builtin_shufflevector(dist0, dist0, 0, 5, 0, 0);
+        __v4si packed_dist = _mm256_castsi256_si128(dist);
+        _mm_store_si128(distance_pair + 8 * i, packed_dist);
+        __v4si packed_dist0 = _mm256_castsi256_si128(dist0);
+        _mm_store_si128(distance_pair + 8 * i + 4, packed_dist0);
+    }
+}
+*/