7 files changed, 378 insertions, 269 deletions
diff --git a/host/examples/CMakeLists.txt b/host/examples/CMakeLists.txt
index fe9e6409e..10d1fddc3 100644
--- a/host/examples/CMakeLists.txt
+++ b/host/examples/CMakeLists.txt
@@ -19,7 +19,7 @@
 # example applications
 ########################################################################
 SET(example_sources
-    benchmark_rx_rate.cpp
+    benchmark_rate.cpp
     rx_multi_samples.cpp
     rx_samples_to_file.cpp
     rx_samples_to_udp.cpp
diff --git a/host/examples/benchmark_rate.cpp b/host/examples/benchmark_rate.cpp
new file mode 100644
index 000000000..6927b512b
--- /dev/null
+++ b/host/examples/benchmark_rate.cpp
@@ -0,0 +1,236 @@
+//
+// Copyright 2011 Ettus Research LLC
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#include <uhd/utils/thread_priority.hpp>
+#include <uhd/utils/safe_main.hpp>
+#include <uhd/usrp/multi_usrp.hpp>
+#include <boost/program_options.hpp>
+#include <boost/format.hpp>
+#include <boost/thread/thread.hpp>
+#include <boost/math/special_functions/round.hpp>
+#include <iostream>
+#include <complex>
+
+namespace po = boost::program_options;
+
+unsigned long long num_overflows = 0; //overflow error codes counted by benchmark_rx_rate
+unsigned long long num_underflows = 0; //underflow events counted by benchmark_tx_rate_async_helper
+unsigned long long num_rx_samps = 0; //sum of the recv return values in benchmark_rx_rate
+unsigned long long num_tx_samps = 0; //sum of the send return values in benchmark_tx_rate
+unsigned long long num_dropped_samps = 0; //estimated in benchmark_rx_rate from the time gap after an overflow
+
+/***********************************************************************
+ * Benchmark RX Rate
+ **********************************************************************/
+void benchmark_rx_rate(uhd::usrp::multi_usrp::sptr usrp){
+    //Receive one packet at a time until this thread is interrupted or recv
+    //reports an unexpected error code. Adds to num_rx_samps, num_overflows
+    //and num_dropped_samps as it goes.
+    uhd::set_thread_priority_safe();
+
+    //announce the rate under test
+    std::cout << boost::format(
+        "Testing receive rate %f Msps"
+    ) % (usrp->get_rx_rate()/1e6) << std::endl;
+
+    //receive state: one packet worth of samples per recv call
+    uhd::rx_metadata_t rx_md;
+    std::vector<std::complex<float> > samples(usrp->get_device()->get_max_recv_samps_per_packet());
+    bool overflow_pending = false;
+    uhd::time_spec_t overflow_time;
+    const double samp_rate = usrp->get_rx_rate();
+
+    usrp->issue_stream_cmd(uhd::stream_cmd_t::STREAM_MODE_START_CONTINUOUS);
+    for (;;){
+        if (boost::this_thread::interruption_requested()) break;
+        const size_t num_recvd = usrp->get_device()->recv(
+            &samples.front(), samples.size(), rx_md,
+            uhd::io_type_t::COMPLEX_FLOAT32, uhd::device::RECV_MODE_ONE_PACKET
+        );
+        num_rx_samps += num_recvd;
+
+        if (rx_md.error_code == uhd::rx_metadata_t::ERROR_CODE_NONE){
+            //first good packet after an overflow: the time gap estimates the loss
+            if (not overflow_pending) continue;
+            overflow_pending = false;
+            const double gap_secs = (rx_md.time_spec - overflow_time).get_real_secs();
+            num_dropped_samps += boost::math::iround(gap_secs*samp_rate);
+        }
+        else if (rx_md.error_code == uhd::rx_metadata_t::ERROR_CODE_OVERFLOW){
+            //remember when the overflow was reported
+            overflow_pending = true;
+            overflow_time = rx_md.time_spec;
+            num_overflows++;
+        }
+        else{
+            std::cerr << "Error code: " << rx_md.error_code << std::endl;
+            std::cerr << "Unexpected error on recv, exit test..." << std::endl;
+            break;
+        }
+    }
+    usrp->issue_stream_cmd(uhd::stream_cmd_t::STREAM_MODE_STOP_CONTINUOUS);
+}
+
+/***********************************************************************
+ * Benchmark TX Rate
+ **********************************************************************/
+void benchmark_tx_rate(uhd::usrp::multi_usrp::sptr usrp){
+    //Send zero-filled packets until this thread is interrupted, adding each
+    //send return value to num_tx_samps, then finish with an end-of-burst.
+    uhd::set_thread_priority_safe();
+
+    //announce the rate under test
+    std::cout << boost::format(
+        "Testing transmit rate %f Msps"
+    ) % (usrp->get_tx_rate()/1e6) << std::endl;
+
+    //untimed metadata plus one packet worth of samples
+    uhd::tx_metadata_t tx_md;
+    tx_md.has_time_spec = false;
+    std::vector<std::complex<float> > samples(usrp->get_device()->get_max_send_samps_per_packet());
+
+    for (;;){
+        if (boost::this_thread::interruption_requested()) break;
+        const size_t num_sent = usrp->get_device()->send(
+            &samples.front(), samples.size(), tx_md,
+            uhd::io_type_t::COMPLEX_FLOAT32, uhd::device::SEND_MODE_ONE_PACKET
+        );
+        num_tx_samps += num_sent;
+    }
+    //zero-length packet flagged end of burst
+    tx_md.end_of_burst = true;
+    usrp->get_device()->send(
+        "", 0, tx_md, uhd::io_type_t::COMPLEX_FLOAT32, uhd::device::SEND_MODE_FULL_BUFF
+    );
+}
+
+void benchmark_tx_rate_async_helper(uhd::usrp::multi_usrp::sptr usrp){
+    //Counts underflow events into num_underflows until a burst ack, an
+    //unexpected event code, or a failed recv after this thread is interrupted.
+    uhd::async_metadata_t async_md;
+    while (true){
+        //recv returned false: no new message arrived, so never dispatch on async_md
+        if (not usrp->get_device()->recv_async_msg(async_md)){
+            if (boost::this_thread::interruption_requested()) return;
+            continue;
+        }
+        //handle the event codes
+        switch(async_md.event_code){
+        case uhd::async_metadata_t::EVENT_CODE_BURST_ACK:
+            return;
+
+        case uhd::async_metadata_t::EVENT_CODE_UNDERFLOW:
+        case uhd::async_metadata_t::EVENT_CODE_UNDERFLOW_IN_PACKET:
+            num_underflows++;
+            break;
+
+        default:
+            std::cerr << "Event code: " << async_md.event_code << std::endl;
+            std::cerr << "Unexpected event on async recv, exit test..." << std::endl;
+            return;
+        }
+    }
+}
+
+/***********************************************************************
+ * Main code + dispatcher
+ **********************************************************************/
+int UHD_SAFE_MAIN(int argc, char *argv[]){
+    //Parses the options, starts one worker thread per requested test, waits
+    //for the requested duration, stops the workers and prints the counters.
+    std::string args;
+    double duration;
+    double rx_rate, tx_rate;
+
+    //describe and parse the command line
+    po::options_description opt_desc("Allowed options");
+    opt_desc.add_options()
+        ("help", "help message")
+        ("args", po::value<std::string>(&args)->default_value(""), "single uhd device address args")
+        ("duration", po::value<double>(&duration)->default_value(10.0), "duration for the test in seconds")
+        ("rx_rate", po::value<double>(&rx_rate), "specify to perform a RX rate test (sps)")
+        ("tx_rate", po::value<double>(&tx_rate), "specify to perform a TX rate test (sps)")
+    ;
+    po::variables_map opts;
+    po::store(po::parse_command_line(argc, argv, opt_desc), opts);
+    po::notify(opts);
+    const bool test_rx = opts.count("rx_rate") != 0;
+    const bool test_tx = opts.count("tx_rate") != 0;
+
+    //usage text only: no device is opened
+    if (opts.count("help") != 0){
+        std::cout << boost::format("UHD Benchmark Rate %s") % opt_desc << std::endl
+            << "    Specify --rx_rate for a receive-only test.\n"
+               "    Specify --tx_rate for a transmit-only test.\n"
+               "    Specify both options for a full-duplex test.\n"
+            << std::endl;
+        return ~0;
+    }
+
+    //locate the device, then open the first match
+    std::cout << std::endl;
+    uhd::device_addrs_t found_addrs = uhd::device::find(args);
+    if (found_addrs.empty()){
+        std::cerr << "Could not find any devices for: " << args << std::endl;
+        return ~0;
+    }
+    if (found_addrs.at(0).get("type", "") == "usrp1"){
+        std::cerr << "*** Warning! ***" << std::endl
+                  << "Benchmark results will be inaccurate on USRP1 due to insufficient features.\n" << std::endl;
+    }
+    std::cout << boost::format("Creating the usrp device with: %s...") % args << std::endl;
+    uhd::usrp::multi_usrp::sptr usrp = uhd::usrp::multi_usrp::make(found_addrs.at(0));
+    std::cout << boost::format("Using Device: %s") % usrp->get_pp_string() << std::endl;
+
+    boost::thread_group workers;
+
+    //receive test: set the rate, then stream on a worker thread
+    if (test_rx){
+        usrp->set_rx_rate(rx_rate);
+        workers.create_thread(boost::bind(&benchmark_rx_rate, usrp));
+    }
+
+    //transmit test: one sender thread plus one async message reader
+    if (test_tx){
+        usrp->set_tx_rate(tx_rate);
+        workers.create_thread(boost::bind(&benchmark_tx_rate, usrp));
+        workers.create_thread(boost::bind(&benchmark_tx_rate_async_helper, usrp));
+    }
+
+    //let the workers run: whole seconds plus the fractional part in microseconds
+    const long whole_secs = static_cast<long>(duration);
+    const long frac_usecs = static_cast<long>((duration - whole_secs)*1e6);
+    boost::this_thread::sleep(boost::posix_time::seconds(whole_secs) + boost::posix_time::microseconds(frac_usecs));
+
+    //stop the workers and wait for them to exit
+    workers.interrupt_all();
+    workers.join_all();
+
+    //report the counters
+    std::cout << std::endl << boost::format(
+        "Benchmark rate summary:\n"
+        "  Num received samples:    %u\n"
+        "  Num dropped samples:     %u\n"
+        "  Num overflows detected:  %u\n"
+        "  Num transmitted samples: %u\n"
+        "  Num underflows detected: %u\n"
+    ) % num_rx_samps % num_dropped_samps % num_overflows % num_tx_samps % num_underflows << std::endl;
+
+    //finished
+    std::cout << std::endl << "Done!" << std::endl << std::endl;
+    return 0;
+}
diff --git a/host/examples/benchmark_rx_rate.cpp b/host/examples/benchmark_rx_rate.cpp
deleted file mode 100644
index 50af1b98b..000000000
--- a/host/examples/benchmark_rx_rate.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-//
-// Copyright 2010-2011 Ettus Research LLC
-//
-// This program is free software: you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation, either version 3 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program.  If not, see <http://www.gnu.org/licenses/>.
-//
-
-#include <uhd/utils/thread_priority.hpp>
-#include <uhd/utils/safe_main.hpp>
-#include <uhd/usrp/multi_usrp.hpp>
-#include <boost/math/special_functions/round.hpp>
-#include <boost/program_options.hpp>
-#include <boost/format.hpp>
-#include <iostream>
-#include <complex>
-
-namespace po = boost::program_options;
-
-static inline void test_device(
-    uhd::usrp::multi_usrp::sptr usrp,
-    double rx_rate_sps,
-    double duration_secs
-){
-    const size_t max_samps_per_packet = usrp->get_device()->get_max_recv_samps_per_packet();
-    std::cout << boost::format("Testing receive rate %f Msps (%f second run)") % (rx_rate_sps/1e6) % duration_secs << std::endl;
-
-    //allocate recv buffer and metatdata
-    uhd::rx_metadata_t md;
-    std::vector<std::complex<float> > buff(max_samps_per_packet);
-
-    //flush the buffers in the recv path
-    while(usrp->get_device()->recv(
-        &buff.front(), buff.size(), md,
-        uhd::io_type_t::COMPLEX_FLOAT32,
-        uhd::device::RECV_MODE_ONE_PACKET
-    )){
-        /* NOP */
-    };
-
-    //declare status variables
-    bool got_first_packet = false;
-    size_t total_recv_packets = 0;
-    size_t total_lost_samples = 0;
-    size_t total_recv_samples = 0;
-    uhd::time_spec_t initial_time_spec;
-    uhd::time_spec_t next_expected_time_spec;
-
-    usrp->issue_stream_cmd(uhd::stream_cmd_t::STREAM_MODE_START_CONTINUOUS);
-    do {
-        size_t num_rx_samps = usrp->get_device()->recv(
-            &buff.front(), buff.size(), md,
-            uhd::io_type_t::COMPLEX_FLOAT32,
-            uhd::device::RECV_MODE_ONE_PACKET
-        );
-
-        //handle the error codes
-        switch(md.error_code){
-        case uhd::rx_metadata_t::ERROR_CODE_NONE:
-        case uhd::rx_metadata_t::ERROR_CODE_OVERFLOW:
-            break;
-
-        default:
-            std::cerr << "Error code: " << md.error_code << std::endl;
-            std::cerr << "Unexpected error on recv, exit test..." << std::endl;
-            return;
-        }
-
-        if (not md.has_time_spec){
-            std::cerr << "Metadata missing time spec, exit test..." << std::endl;
-            return;
-        }
-
-        total_recv_samples += num_rx_samps;
-        total_recv_packets++;
-
-        if (not got_first_packet){
-            initial_time_spec = md.time_spec;
-            next_expected_time_spec = initial_time_spec;
-            got_first_packet = true;
-        }
-
-        double approx_lost_samps = rx_rate_sps*(md.time_spec - next_expected_time_spec).get_real_secs();
-        total_lost_samples += std::max(0, boost::math::iround(approx_lost_samps));
-        next_expected_time_spec = md.time_spec + uhd::time_spec_t(0, num_rx_samps, rx_rate_sps);
-
-    } while((next_expected_time_spec - initial_time_spec) < uhd::time_spec_t(duration_secs));
-    usrp->issue_stream_cmd(uhd::stream_cmd_t::STREAM_MODE_STOP_CONTINUOUS);
-
-    //print a summary
-    std::cout << std::endl; //go to newline, recv may spew SXSYSZ...
-    std::cout << boost::format("    Received packets: %d") % total_recv_packets << std::endl;
-    std::cout << boost::format("    Received samples: %d") % total_recv_samples << std::endl;
-    std::cout << boost::format("    Lost samples: %d") % total_lost_samples << std::endl;
-    size_t packets_lost = boost::math::iround(double(total_lost_samples)/max_samps_per_packet);
-    std::cout << boost::format("    Lost packets: %d (approximate)") % packets_lost << std::endl;
-    double actual_rx_rate_sps = (total_recv_samples*rx_rate_sps)/(total_recv_samples+total_lost_samples);
-    std::cout << boost::format("    Sustained receive rate: %f Msps") % (actual_rx_rate_sps/1e6) << std::endl;
-    std::cout << std::endl << std::endl;
-}
-
-int UHD_SAFE_MAIN(int argc, char *argv[]){
-    uhd::set_thread_priority_safe();
-
-    //variables to be set by po
-    std::string args;
-    double duration;
-    double rate;
-
-    //setup the program options
-    po::options_description desc("Allowed options");
-    desc.add_options()
-        ("help", "help message")
-        ("args", po::value<std::string>(&args)->default_value(""), "single uhd device address args")
-        ("duration", po::value<double>(&duration)->default_value(10.0), "duration for each test in seconds")
-        ("rate", po::value<double>(&rate), "specify to perform a single test as this rate (sps)")
-    ;
-    po::variables_map vm;
-    po::store(po::parse_command_line(argc, argv, desc), vm);
-    po::notify(vm);
-
-    //print the help message
-    if (vm.count("help")){
-        std::cout << boost::format("UHD Benchmark RX Rate %s") % desc << std::endl;
-        return ~0;
-    }
-
-    //verify that rate was specified
-    if (not vm.count("rate")){
-        std::cerr << "Please specify the sample rate with --rate" << std::endl;
-        return ~0;
-    }
-
-    //create a usrp device
-    std::cout << std::endl;
-    uhd::device_addrs_t device_addrs = uhd::device::find(args);
-    if (device_addrs.empty()){
-        std::cerr << "Could not find any devices for: " << args << std::endl;
-        return ~0;
-    }
-    if (device_addrs.at(0).get("type", "") == "usrp1"){
-        std::cerr << "*** Warning! ***" << std::endl;
-        std::cerr << "Benchmark RX results will be inaccurate on USRP1 due to soft-time control.\n" << std::endl;
-    }
-    std::cout << boost::format("Creating the usrp device with: %s...") % args << std::endl;
-    uhd::usrp::multi_usrp::sptr usrp = uhd::usrp::multi_usrp::make(device_addrs.at(0));
-    std::cout << boost::format("Using Device: %s") % usrp->get_pp_string() << std::endl;
-
-    //start the test
-    usrp->set_rx_rate(rate);
-    rate = usrp->get_rx_rate();
-    test_device(usrp, rate, duration);
-
-    //finished
-    std::cout << std::endl << "Done!" << std::endl << std::endl;
-
-    return 0;
-}
diff --git a/host/lib/convert/convert_with_sse2.cpp b/host/lib/convert/convert_with_sse2.cpp
index 52beea24a..9772028dc 100644
--- a/host/lib/convert/convert_with_sse2.cpp
+++ b/host/lib/convert/convert_with_sse2.cpp
@@ -25,25 +25,37 @@ DECLARE_CONVERTER(convert_fc32_1_to_item32_1_nswap, PRIORITY_CUSTOM){
     const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]);
     item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
 
-    __m128 scalar = _mm_set_ps1(float(scale_factor));
-
-    //convert blocks of samples with intrinsics
-    size_t i = 0; for (; i < (nsamps & ~0x3); i+=4){
-        //load from input
-        __m128 tmplo = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+0));
-        __m128 tmphi = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+2));
-
-        //convert and scale
-        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));
-        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));
-
-        //pack + swap 16-bit pairs
-        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);
-        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));
-        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));
-
-        //store to output
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);
+    const __m128 scalar = _mm_set_ps1(float(scale_factor));
+    /* 4-sample SSE2 body: _al_ empty selects aligned _mm_load_ps, _al_ = u selects _mm_loadu_ps */
+    #define convert_fc32_1_to_item32_1_nswap_guts(_al_)                 \
+    for (; i+3 < nsamps; i+=4){ /* i may start at 1: need 4 whole samples left */ \
+        /* load from input */                                           \
+        __m128 tmplo = _mm_load ## _al_ ## _ps(reinterpret_cast<const float *>(input+i+0)); \
+        __m128 tmphi = _mm_load ## _al_ ## _ps(reinterpret_cast<const float *>(input+i+2)); \
+                                                                        \
+        /* convert and scale */ \
+        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));    \
+        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));    \
+                                                                        \
+        /* pack + swap 16-bit pairs */                                  \
+        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                 \
+        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \
+        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \
+                                                                        \
+        /* store to output */                                           \
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);  \
+    }                                                                   \
+
+    size_t i = 0;
+    //dispatch on input alignment: 0x8 converts one sample first (assuming 8-byte fc32_t) so the loads are 16-byte aligned;
+    //nsamps == 0 is routed to case 0x0 so that the peeled sample never touches an empty buffer
+    switch (nsamps? (size_t(input) & 0xf) : 0x0){
+    case 0x8:
+        output[i] = fc32_to_item32(input[i], float(scale_factor)); i++;
+    case 0x0:
+        convert_fc32_1_to_item32_1_nswap_guts()
+        break;
+    default: convert_fc32_1_to_item32_1_nswap_guts(u)
     }
 
     //convert remainder
@@ -56,24 +68,36 @@ DECLARE_CONVERTER(convert_fc32_1_to_item32_1_bswap, PRIORITY_CUSTOM){
     const fc32_t *input = reinterpret_cast<const fc32_t *>(inputs[0]);
     item32_t *output = reinterpret_cast<item32_t *>(outputs[0]);
 
-    __m128 scalar = _mm_set_ps1(float(scale_factor));
-
-    //convert blocks of samples with intrinsics
-    size_t i = 0; for (; i < (nsamps & ~0x3); i+=4){
-        //load from input
-        __m128 tmplo = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+0));
-        __m128 tmphi = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+2));
-
-        //convert and scale
-        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));
-        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));
-
-        //pack + byteswap -> byteswap 16 bit words
-        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);
-        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8));
-
-        //store to output
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);
+    const __m128 scalar = _mm_set_ps1(float(scale_factor));
+    /* 4-sample SSE2 body: _al_ empty selects aligned _mm_load_ps, _al_ = u selects _mm_loadu_ps */
+    #define convert_fc32_1_to_item32_1_bswap_guts(_al_)                 \
+    for (; i+3 < nsamps; i+=4){ /* i may start at 1: need 4 whole samples left */ \
+        /* load from input */                                           \
+        __m128 tmplo = _mm_load ## _al_ ## _ps(reinterpret_cast<const float *>(input+i+0)); \
+        __m128 tmphi = _mm_load ## _al_ ## _ps(reinterpret_cast<const float *>(input+i+2)); \
+                                                                        \
+        /* convert and scale */ \
+        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));    \
+        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));    \
+                                                                        \
+        /* pack + byteswap -> byteswap 16 bit words */                  \
+        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);                 \
+        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+                                                                        \
+        /* store to output */                                           \
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);  \
+    }                                                                   \
+
+    size_t i = 0;
+    //dispatch on input alignment: 0x8 converts one sample first (assuming 8-byte fc32_t) so the loads are 16-byte aligned;
+    //nsamps == 0 is routed to case 0x0 so that the peeled sample never touches an empty buffer
+    switch (nsamps? (size_t(input) & 0xf) : 0x0){
+    case 0x8:
+        output[i] = uhd::byteswap(fc32_to_item32(input[i], float(scale_factor))); i++;
+    case 0x0:
+        convert_fc32_1_to_item32_1_bswap_guts()
+        break;
+    default: convert_fc32_1_to_item32_1_bswap_guts(u)
     }
 
     //convert remainder
@@ -86,27 +110,39 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_nswap, PRIORITY_CUSTOM){
     const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
     fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
 
-    __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));
-    __m128i zeroi = _mm_setzero_si128();
-
-    //convert blocks of samples with intrinsics
-    size_t i = 0; for (; i < (nsamps & ~0x3); i+=4){
-        //load from input
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i));
-
-        //unpack + swap 16-bit pairs
-        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));
-        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));
-        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); //value in upper 16 bits
-        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);
-
-        //convert and scale
-        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);
-        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);
-
-        //store to output
-        _mm_storeu_ps(reinterpret_cast<float *>(output+i+0), tmplo);
-        _mm_storeu_ps(reinterpret_cast<float *>(output+i+2), tmphi);
+    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));
+    const __m128i zeroi = _mm_setzero_si128();
+    /* 4-sample SSE2 body: _al_ empty selects aligned _mm_store_ps, _al_ = u selects _mm_storeu_ps */
+    #define convert_item32_1_to_fc32_1_nswap_guts(_al_)                 \
+    for (; i+3 < nsamps; i+=4){ /* i may start at 1: need 4 whole samples left */ \
+        /* load from input */                                           \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
+                                                                        \
+        /* unpack + swap 16-bit pairs */                                \
+        tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \
+        tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1));      \
+        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
+        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);               \
+                                                                        \
+        /* convert and scale */                                         \
+        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);     \
+        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);     \
+                                                                        \
+        /* store to output */                                           \
+        _mm_store ## _al_ ## _ps(reinterpret_cast<float *>(output+i+0), tmplo); \
+        _mm_store ## _al_ ## _ps(reinterpret_cast<float *>(output+i+2), tmphi); \
+    }                                                                   \
+
+    size_t i = 0;
+    //dispatch on output alignment: 0x8 converts one sample first (assuming 8-byte fc32_t) so the stores are 16-byte aligned;
+    //nsamps == 0 is routed to case 0x0 so that the peeled sample never touches an empty buffer
+    switch (nsamps? (size_t(output) & 0xf) : 0x0){
+    case 0x8:
+        output[i] = item32_to_fc32(input[i], float(scale_factor)); i++;
+    case 0x0:
+        convert_item32_1_to_fc32_1_nswap_guts()
+        break;
+    default: convert_item32_1_to_fc32_1_nswap_guts(u)
     }
 
     //convert remainder
@@ -119,26 +155,38 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_bswap, PRIORITY_CUSTOM){
     const item32_t *input = reinterpret_cast<const item32_t *>(inputs[0]);
     fc32_t *output = reinterpret_cast<fc32_t *>(outputs[0]);
 
-    __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));
-    __m128i zeroi = _mm_setzero_si128();
-
-    //convert blocks of samples with intrinsics
-    size_t i = 0; for (; i < (nsamps & ~0x3); i+=4){
-        //load from input
-        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i));
-
-        //byteswap + unpack -> byteswap 16 bit words
-        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8));
-        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); //value in upper 16 bits
-        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);
-
-        //convert and scale
-        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);
-        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);
-
-        //store to output
-        _mm_storeu_ps(reinterpret_cast<float *>(output+i+0), tmplo);
-        _mm_storeu_ps(reinterpret_cast<float *>(output+i+2), tmphi);
+    const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16));
+    const __m128i zeroi = _mm_setzero_si128();
+    /* 4-sample SSE2 body: _al_ empty selects aligned _mm_store_ps, _al_ = u selects _mm_storeu_ps */
+    #define convert_item32_1_to_fc32_1_bswap_guts(_al_)                 \
+    for (; i+3 < nsamps; i+=4){ /* i may start at 1: need 4 whole samples left */ \
+        /* load from input */                                           \
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i)); \
+                                                                        \
+        /* byteswap + unpack -> byteswap 16 bit words */                \
+        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \
+        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \
+        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);               \
+                                                                        \
+        /* convert and scale */                                         \
+        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);     \
+        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);     \
+                                                                        \
+        /* store to output */                                           \
+        _mm_store ## _al_ ## _ps(reinterpret_cast<float *>(output+i+0), tmplo); \
+        _mm_store ## _al_ ## _ps(reinterpret_cast<float *>(output+i+2), tmphi); \
+    }                                                                   \
+
+    size_t i = 0;
+    //dispatch on output alignment: 0x8 converts one sample first (assuming 8-byte fc32_t) so the stores are 16-byte aligned;
+    //nsamps == 0 is routed to case 0x0 so that the peeled sample never touches an empty buffer
+    switch (nsamps? (size_t(output) & 0xf) : 0x0){
+    case 0x8:
+        output[i] = item32_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); i++;
+    case 0x0:
+        convert_item32_1_to_fc32_1_bswap_guts()
+        break;
+    default: convert_item32_1_to_fc32_1_bswap_guts(u)
     }
 
     //convert remainder
diff --git a/host/lib/transport/libusb1_zero_copy.cpp b/host/lib/transport/libusb1_zero_copy.cpp
index 19a7a3742..f781f890d 100644
--- a/host/lib/transport/libusb1_zero_copy.cpp
+++ b/host/lib/transport/libusb1_zero_copy.cpp
@@ -222,7 +222,6 @@ public:
             }
         }
         //shutdown the threads
-        _threads_running = false;
         _thread_group.interrupt_all();
         _thread_group.join_all();
     }
@@ -277,15 +276,13 @@ private:
 
     //! event handler threads
     boost::thread_group _thread_group;
-    bool _threads_running;
 
     void run_event_loop(boost::barrier &spawn_barrier){
-        _threads_running = true;
         spawn_barrier.wait();
         set_thread_priority_safe();
         libusb_context *context = libusb::session::get_global_session()->get_context();
         try{
-            while(_threads_running){
+            while (not boost::this_thread::interruption_requested()){
                 timeval tv;
                 tv.tv_sec = 0;
                 tv.tv_usec = 100000; //100ms
diff --git a/host/lib/usrp/usrp2/io_impl.cpp b/host/lib/usrp/usrp2/io_impl.cpp
index ffe9a88e7..df452942c 100644
--- a/host/lib/usrp/usrp2/io_impl.cpp
+++ b/host/lib/usrp/usrp2/io_impl.cpp
@@ -146,7 +146,6 @@ struct usrp2_impl::io_impl{
     }
 
     ~io_impl(void){
-        recv_pirate_crew_raiding = false;
         recv_pirate_crew.interrupt_all();
         recv_pirate_crew.join_all();
     }
@@ -185,7 +184,6 @@ struct usrp2_impl::io_impl{
     //methods and variables for the pirate crew
     void recv_pirate_loop(boost::barrier &, usrp2_mboard_impl::sptr, zero_copy_if::sptr, size_t);
     boost::thread_group recv_pirate_crew;
-    bool recv_pirate_crew_raiding;
     bounded_buffer<async_metadata_t> async_msg_fifo;
 };
 
@@ -201,14 +199,13 @@ void usrp2_impl::io_impl::recv_pirate_loop(
     zero_copy_if::sptr err_xport,
     size_t index
 ){
-    recv_pirate_crew_raiding = true;
     spawn_barrier.wait();
     set_thread_priority_safe();
 
     //store a reference to the flow control monitor (offset by max dsps)
     flow_control_monitor &fc_mon = *(this->fc_mons[index*usrp2_mboard_impl::MAX_NUM_DSPS]);
 
-    while(recv_pirate_crew_raiding){
+    while (not boost::this_thread::interruption_requested()){
         managed_recv_buffer::sptr buff = err_xport->get_recv_buff();
         if (not buff.get()) continue; //ignore timeout/error buffers
 
diff --git a/host/utils/usrp2_card_burner.py b/host/utils/usrp2_card_burner.py
index 26adb91c7..43689dd20 100755
--- a/host/utils/usrp2_card_burner.py
+++ b/host/utils/usrp2_card_burner.py
@@ -50,7 +50,7 @@ def command(*args):
         stderr=subprocess.STDOUT,
     )
     ret = p.wait()
-    verbose = p.stdout.read().decode('ascii')
+    verbose = p.stdout.read().decode()
     if ret != 0: raise Exception(verbose)
     return verbose
 
@@ -92,12 +92,12 @@ def get_raw_device_hints():
                 if in_info: info += '\n'+line.strip()
         def is_info_valid(info):
             try:
-                assert 'link to' in info
+                if 'link to' not in info: return False
                 #handles two spellings of remov(e)able:
-                assert 'remov' in info.lower()
-                if 'size is' in info: assert int(extract_info_value(info, 'size is')) <= MAX_SD_CARD_SIZE
-                return True
+                if 'remov' not in info.lower(): return False
+                if 'size is' in info and int(extract_info_value(info, 'size is')) > MAX_SD_CARD_SIZE: return False
             except: return False
+            return True
         def extract_info_name(info):
             for key in ('Mounted on', 'link to'):
                 if key in info: return extract_info_value(info, key)
@@ -110,13 +110,11 @@ def get_raw_device_hints():
     ####################################################################
     if platform.system() == 'Linux':
         devs = list()
-        try: output = open('/proc/partitions', 'r').read().decode('ascii')
-        except: return devs
-        for line in output.splitlines():
+        for line in command('cat', '/proc/partitions').splitlines():
             try:
                 major, minor, blocks, name = line.split()
-                assert not name[-1].isdigit() or int(minor) == 0
-                assert int(blocks)*1024 <= MAX_SD_CARD_SIZE
+                if name[-1].isdigit() and int(minor) != 0: continue #negation of the removed assert: skips numbered partitions, keeps whole disks
+                if int(blocks)*1024 > MAX_SD_CARD_SIZE: continue
             except: continue
             devs.append(os.path.join('/dev', name))
 
@@ -128,17 +126,17 @@ def get_raw_device_hints():
     if platform.system() == 'Darwin':
         devs = [d.split()[0] for d in [l for l in command('diskutil', 'list').splitlines() if l.startswith('/dev')]]
         def output_to_info(output):
-            return dict([list(map(str.strip, pair.lower().split(':'))) for pair in [l for l in output.splitlines() if ':' in l]])
+            return dict([list(map(lambda x: x.strip(), pair.lower().split(':'))) for pair in [l for l in output.splitlines() if ':' in l]])
         def is_dev_valid(dev):
             info = output_to_info(command('diskutil', 'info', dev))
             try:
-                if 'internal' in info: assert info['internal'] == 'no'
-                if 'ejectable' in info: assert info['ejectable'] == 'yes'
+                if 'internal' in info and info['internal'] == 'yes': return False
+                if 'ejectable' in info and info['ejectable'] == 'no': return False
                 if 'total size' in info:
                     size_match = re.match('^.*\((\d+)\s*bytes\).*$', info['total size'])
-                    if size_match: assert int(size_match.groups()[0]) <= MAX_SD_CARD_SIZE
-                return True
+                    if size_match and int(size_match.groups()[0]) > MAX_SD_CARD_SIZE: return False
             except: return False
+            return True
 
         return sorted(set(filter(is_dev_valid, devs)))