DSP performance upgrades + bugfix

2025-10-21 13:19:22 +02:00 · 2021-03-29 21:53:43 +02:00
parent b72246d769
commit 27394a091f
29 changed files with 942 additions and 355 deletions
--- a/core/src/dsp/pll.h
+++ b/core/src/dsp/pll.h
@@ -1,185 +1,117 @@
 #pragma once
 #include <dsp/block.h>
 #include <dsp/interpolation_taps.h>
-
-#define DSP_SIGN(n)     ((n) >= 0)
-#define DSP_STEP(n)     (((n) > 0.0f) ? 1.0f : -1.0f)
+#include <math.h>
+#include <dsp/utils/macros.h>

 namespace dsp {
-    class SqSymbolRecovery : public generic_block<SqSymbolRecovery> {
+    template <int ORDER>
+    class CostasLoop: public generic_block<CostasLoop<ORDER>> {
    public:
-        SqSymbolRecovery() {}
+        CostasLoop() {}
+        CostasLoop(stream<complex_t>* in, float loopBandwidth) { init(in, loopBandwidth); }

-        SqSymbolRecovery(stream<float>* in, int omega) { init(in, omega); }
+        void init(stream<complex_t>* in, float loopBandwidth) {
+            _in = in;
+            lastVCO.re = 1.0f;
+            lastVCO.im = 0.0f;
+            _loopBandwidth = loopBandwidth;

-        ~SqSymbolRecovery() {
-            generic_block<SqSymbolRecovery>::stop();
+            float dampningFactor = sqrtf(2.0f) / 2.0f;
+            float denominator = (1.0 + 2.0 * dampningFactor * _loopBandwidth + _loopBandwidth * _loopBandwidth);
+            _alpha = (4 * dampningFactor * _loopBandwidth) / denominator;
+            _beta = (4 * _loopBandwidth * _loopBandwidth) / denominator;
+
+            generic_block<CostasLoop<ORDER>>::registerInput(_in);
+            generic_block<CostasLoop<ORDER>>::registerOutput(&out);
        }

-        void init(stream<float>* in, int omega) {
+        void setInput(stream<complex_t>* in) {
+            generic_block<CostasLoop<ORDER>>::tempStop();
+            generic_block<CostasLoop<ORDER>>::unregisterInput(_in);
            _in = in;
-            samplesPerSymbol = omega;
-            generic_block<SqSymbolRecovery>::registerInput(_in);
-            generic_block<SqSymbolRecovery>::registerOutput(&out);
+            generic_block<CostasLoop<ORDER>>::registerInput(_in);
+            generic_block<CostasLoop<ORDER>>::tempStart();
+        }
+
+        void setLoopBandwidth(float loopBandwidth) {
+            generic_block<CostasLoop<ORDER>>::tempStop();
+            _loopBandwidth = loopBandwidth;
+            float dampningFactor = sqrtf(2.0f) / 2.0f;
+            float denominator = (1.0 + 2.0 * dampningFactor * _loopBandwidth + _loopBandwidth * _loopBandwidth);
+            _alpha = (4 * dampningFactor * _loopBandwidth) / denominator;
+            _beta = (4 * _loopBandwidth * _loopBandwidth) / denominator;
+            generic_block<CostasLoop<ORDER>>::tempStart();
        }

        int run() {
-            count = _in->read();
+            int count = _in->read();
            if (count < 0) { return -1; }

-            int outCount = 0;
+            complex_t outVal;
+            float error;

            for (int i = 0; i < count; i++) {
-                if (DSP_SIGN(lastVal) != DSP_SIGN(_in->readBuf[i])) {
-                    counter = samplesPerSymbol / 2;
-                    lastVal = _in->readBuf[i];
-                    continue;
-                }

-                if (counter >= samplesPerSymbol) {
-                    counter = 0;
-                    out.writeBuf[outCount] = _in->readBuf[i];
-                    outCount++;
-                }
-                else {
-                    counter++;
-                }
+                // Mix the VFO with the input to create the output value
+                outVal = lastVCO * _in->readBuf[i];
+                out.writeBuf[i] = outVal;
+
+                // Calculate the phase error estimation
+                if constexpr (ORDER == 2) {
+                    error = outVal.re * outVal.im;
+                }
+                if constexpr (ORDER == 4) {
+                    error = (DSP_STEP(outVal.re) * outVal.im) - (DSP_STEP(outVal.im) * outVal.re);
+                }
+                if constexpr (ORDER == 8) {
+                    // This is taken from GR, I have no idea how it works but it does...
+                    const float K = (sqrtf(2.0) - 1);
+                    if (fabsf(outVal.re) >= fabsf(outVal.im)) {
+                        error = ((outVal.re > 0.0f ? 1.0f : -1.0f) * outVal.im -
+                                (outVal.im > 0.0f ? 1.0f : -1.0f) * outVal.re * K);
+                    } else {
+                        error = ((outVal.re > 0.0f ? 1.0f : -1.0f) * outVal.im * K -
+                                (outVal.im > 0.0f ? 1.0f : -1.0f) * outVal.re);
+                    }
+                }
+                
+                if (error > 1.0f) { error = 1.0f; }
+                if (error < -1.0f) { error = -1.0f; }
+                
+                // Integrate frequency and clamp it
+                vcoFrequency += _beta * error;
+                if (vcoFrequency > 1.0f) { vcoFrequency = 1.0f; }
+                if (vcoFrequency < -1.0f) { vcoFrequency = -1.0f; }
+
+                // Calculate new phase and wrap it
+                vcoPhase += vcoFrequency + (_alpha * error);
+                while (vcoPhase > (2.0f * FL_M_PI)) { vcoPhase -= (2.0f * FL_M_PI); }
+                while (vcoPhase < (-2.0f * FL_M_PI)) { vcoPhase += (2.0f * FL_M_PI); }
+
+                // Calculate output
+                lastVCO.re = cosf(vcoPhase);
+                lastVCO.im = sinf(vcoPhase);

-                lastVal = _in->readBuf[i];
            }
            
            _in->flush();
-            if (!out.swap(outCount)) { return -1; }
+            if (!out.swap(count)) { return -1; }
            return count;
        }

-        stream<float> out;
+        stream<complex_t> out;

    private:
-        int count;
-        int samplesPerSymbol = 1;
-        int counter = 0;
-        float lastVal = 0;
-        stream<float>* _in;
+        float _loopBandwidth = 1.0f;

-    };
+        float _alpha; // Integral coefficient
+        float _beta; // Proportional coefficient
+        float vcoFrequency = 0.0f;
+        float vcoPhase = 0.0f;
+        complex_t lastVCO;

-
-
-    class MMClockRecovery : public generic_block<MMClockRecovery> {
-    public:
-        MMClockRecovery() {}
-
-        MMClockRecovery(stream<float>* in, float omega, float gainOmega, float muGain, float omegaRelLimit) {
-            init(in, omega, gainOmega, muGain, omegaRelLimit);
-        }
-
-        ~MMClockRecovery() {
-            generic_block<MMClockRecovery>::stop();
-        }
-
-        void init(stream<float>* in, float omega, float gainOmega, float muGain, float omegaRelLimit) {
-            _in = in;
-            _omega = omega;
-            _muGain = muGain;
-            _gainOmega = gainOmega;
-            _omegaRelLimit = omegaRelLimit;
-
-            omegaMin = _omega - (_omega * _omegaRelLimit);
-            omegaMax = _omega + (_omega * _omegaRelLimit);
-            _dynOmega = _omega;
-
-            generic_block<MMClockRecovery>::registerInput(_in);
-            generic_block<MMClockRecovery>::registerOutput(&out);
-        }
-
-        int run() {
-            count = _in->read();
-            if (count < 0) { return -1; }
-
-            int outCount = 0;
-            float outVal;
-            float phaseError;
-            float roundedStep;
-            int maxOut = 2.0f * _omega * (float)count;
-
-            // Copy the first 7 values to the delay buffer for fast computing
-            memcpy(&delay[7], _in->readBuf, 7);
-
-            int i = nextOffset;
-            for (; i < count && outCount < maxOut;) {
-                // Calculate output value
-                // If we still need to use the old values, calculate using delay buf
-                // Otherwise, use normal buffer
-                if (i < 7) {
-                    volk_32f_x2_dot_prod_32f(&outVal, &delay[i], INTERP_TAPS[(int)roundf(_mu * 128.0f)], 8);
-                }
-                else {
-                    volk_32f_x2_dot_prod_32f(&outVal, &_in->readBuf[i - 7], INTERP_TAPS[(int)roundf(_mu * 128.0f)], 8);
-                }
-                out.writeBuf[outCount] = outVal;
-
-
-                // Cursed phase detect approximation (don't ask me how this approximation works)
-                phaseError = (DSP_STEP(lastOutput)*outVal) - (lastOutput*DSP_STEP(outVal));
-                lastOutput = outVal;
-                outCount++;
-
-                // Adjust the symbol rate using the phase error approximation and clamp
-                // TODO: Branchless clamp
-                _dynOmega = _dynOmega + (_gainOmega * phaseError);
-                if (_dynOmega > omegaMax) { _dynOmega = omegaMax; }
-                else if (_dynOmega < omegaMin) { _dynOmega = omegaMin; }
-
-                // Adjust the symbol phase according to the phase error approximation
-                // It will now contain the phase delta needed to jump to the next symbol
-                // Rounded step will contain the rounded number of symbols
-                _mu = _mu + _dynOmega + (_muGain * phaseError);
-                roundedStep = floor(_mu);
-
-                // Step to where the next symbol should be
-                i += (int)roundedStep;
-
-                // Now that we've stepped to the next symbol, keep only the offset inside the symbol
-                _mu -= roundedStep;
-            }
-
-            nextOffset = i - count;
-
-            // Save the last 7 values for the next round
-            memcpy(delay, &_in->readBuf[count - 7], 7);
-            
-            _in->flush();
-            if (!out.swap(outCount)) { return -1; }
-            return count;
-        }
-
-        stream<float> out;
-
-    private:
-        int count;
-
-        // Delay buffer
-        float delay[15];
-        int nextOffset = 0;
-
-        // Configuration
-        float _omega = 1.0f;
-        float _muGain = 1.0f;
-        float _gainOmega = 0.001f;
-        float _omegaRelLimit = 0.005;
-
-        // Precalculated values
-        float omegaMin = _omega + (_omega * _omegaRelLimit);
-        float omegaMax = _omega + (_omega * _omegaRelLimit);
-
-        // Runtime adjusted
-        float _dynOmega = _omega;
-        float _mu = 0.5f;
-        float lastOutput = 0.0f;
-        
-
-        stream<float>* _in;
+        stream<complex_t>* _in;

    };
 }