6 files changed, 283 insertions, 98 deletions
diff --git a/host/lib/transport/CMakeLists.txt b/host/lib/transport/CMakeLists.txt
index 872865d6c..70cf6312d 100644
--- a/host/lib/transport/CMakeLists.txt
+++ b/host/lib/transport/CMakeLists.txt
@@ -18,6 +18,16 @@
 #This file will be included by cmake, use absolute paths!
 
 ########################################################################
+# Check for SIMD headers
+########################################################################
+INCLUDE(CheckIncludeFileCXX)
+CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) # result variable is set when the header can be included
+
+IF(HAVE_EMMINTRIN_H)
+    ADD_DEFINITIONS(-DHAVE_EMMINTRIN_H) # tested by convert_types_impl.hpp to enable its SSE2 converters
+ENDIF(HAVE_EMMINTRIN_H)
+
+########################################################################
 # Setup defines for interface address discovery
 ########################################################################
 MESSAGE(STATUS "Configuring interface address discovery...")
@@ -49,6 +59,16 @@ LIBUHD_PYTHON_GEN_SOURCE(
     ${CMAKE_BINARY_DIR}/lib/transport/convert_types.cpp
 )
 
+# append this directory to the include path so the generated convert types
+# can include the implementation convert types file in the source directory
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/lib/transport)
+
+# make the generated convert types depend on the implementation header
+SET_SOURCE_FILES_PROPERTIES(
+    ${CMAKE_BINARY_DIR}/lib/transport/convert_types.cpp PROPERTIES
+    OBJECT_DEPENDS ${CMAKE_SOURCE_DIR}/lib/transport/convert_types_impl.hpp
+)
+
 LIBUHD_APPEND_SOURCES(
     ${CMAKE_SOURCE_DIR}/lib/transport/if_addrs.cpp
     ${CMAKE_SOURCE_DIR}/lib/transport/udp_simple.cpp
diff --git a/host/lib/transport/convert_types_impl.hpp b/host/lib/transport/convert_types_impl.hpp
new file mode 100644
index 000000000..5958b08cb
--- /dev/null
+++ b/host/lib/transport/convert_types_impl.hpp
@@ -0,0 +1,201 @@
+//
+// Copyright 2010 Ettus Research LLC
+//
+// This program is free software: you can redistribute it and/or modify
+// it under the terms of the GNU General Public License as published by
+// the Free Software Foundation, either version 3 of the License, or
+// (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program.  If not, see <http://www.gnu.org/licenses/>.
+//
+
+#ifndef INCLUDED_LIBUHD_TRANSPORT_CONVERT_TYPES_IMPL_HPP
+#define INCLUDED_LIBUHD_TRANSPORT_CONVERT_TYPES_IMPL_HPP
+
+#include <uhd/config.hpp>
+#include <uhd/utils/byteswap.hpp>
+#include <boost/cstdint.hpp>
+#include <cstring>
+#include <complex>
+
+#ifdef HAVE_EMMINTRIN_H
+    #define USE_EMMINTRIN_H //use sse2 intrinsics
+#endif
+
+/***********************************************************************
+ * Typedefs
+ **********************************************************************/
+typedef std::complex<float>          fc32_t;
+typedef std::complex<boost::int16_t> sc16_t;
+typedef boost::uint32_t              item32_t;
+
+/***********************************************************************
+ * Convert complex short buffer to items32
+ **********************************************************************/
+//Pack a complex short as (real << 16) | imag, matching fc32_to_item32 (a raw memcpy only does so on big-endian hosts).
+static UHD_INLINE item32_t sc16_to_item32(sc16_t num){
+    return (item32_t(boost::uint16_t(num.real())) << 16) | item32_t(boost::uint16_t(num.imag()));
+}
+
+//Convert nsamps complex shorts to 32-bit items without byteswapping.
+static UHD_INLINE void sc16_to_item32_nswap(const sc16_t *input, item32_t *output, size_t nsamps){
+    for (size_t i = 0; i < nsamps; i++) output[i] = sc16_to_item32(input[i]);
+}
+
+//Convert nsamps complex shorts to 32-bit items, byteswapping each packed word.
+static UHD_INLINE void sc16_to_item32_bswap(const sc16_t *input, item32_t *output, size_t nsamps){
+    for (size_t i = 0; i < nsamps; i++) output[i] = uhd::byteswap(sc16_to_item32(input[i]));
+}
+
+/***********************************************************************
+ * Convert items32 buffer to complex short
+ **********************************************************************/
+//Unpack a 32-bit item into a complex short: upper 16 bits are real, lower 16 bits are imag.
+static UHD_INLINE sc16_t item32_to_sc16(item32_t item){
+    return sc16_t(boost::int16_t(item >> 16), boost::int16_t(item >> 0));
+}
+
+//Convert nsamps 32-bit items to complex shorts without byteswapping.
+static UHD_INLINE void item32_to_sc16_nswap(const item32_t *input, sc16_t *output, size_t nsamps){
+    for (size_t i = 0; i < nsamps; i++) output[i] = item32_to_sc16(input[i]);
+}
+
+//Convert nsamps byteswapped 32-bit items to complex shorts.
+static UHD_INLINE void item32_to_sc16_bswap(const item32_t *input, sc16_t *output, size_t nsamps){
+    for (size_t i = 0; i < nsamps; i++) output[i] = item32_to_sc16(uhd::byteswap(input[i]));
+}
+
+/***********************************************************************
+ * Convert complex float buffer to items32
+ **********************************************************************/
+static const float shorts_per_float = float(32767);
+
+//Scale both components by shorts_per_float, truncate to shorts, pack as (real << 16) | imag.
+static UHD_INLINE item32_t fc32_to_item32(fc32_t num){
+    return (item32_t(boost::uint16_t(boost::int16_t(num.real()*shorts_per_float))) << 16)
+         | (item32_t(boost::uint16_t(boost::int16_t(num.imag()*shorts_per_float))) << 0);
+}
+
+//Convert nsamps complex floats to 32-bit items without byteswapping.
+static UHD_INLINE void fc32_to_item32_nswap(
+    const fc32_t *input, item32_t *output, size_t nsamps
+){
+    const fc32_t *const stop = input + nsamps;
+    while (input != stop) *output++ = fc32_to_item32(*input++);
+}
+
+#if defined(USE_EMMINTRIN_H)
+#include <emmintrin.h>
+
+static UHD_INLINE void fc32_to_item32_bswap(
+    const fc32_t *input, item32_t *output, size_t nsamps
+){ //SSE2 version: complex floats -> byteswapped 32-bit items, 4 samples per pass
+    __m128 scalar = _mm_set_ps1(shorts_per_float); //scale factor replicated into all 4 lanes
+
+    //vector loop over the largest multiple of 4 samples (nsamps & ~0x3)
+    size_t i = 0; for (; i < (nsamps & ~0x3); i+=4){
+        //unaligned loads of 4 floats each: samples i,i+1 (tmplo) and i+2,i+3 (tmphi)
+        __m128 tmplo = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+0));
+        __m128 tmphi = _mm_loadu_ps(reinterpret_cast<const float *>(input+i+2));
+
+        //scale, then convert each float to a 32-bit integer
+        __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar));
+        __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar));
+        //NOTE(review): cvtps rounds per MXCSR and packs saturates; the scalar remainder path truncates - confirm
+        //pack to 8 shorts in re,im order, then swap the bytes of every short -> byteswapped 32 bit words
+        __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi);
+        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8));
+
+        //unaligned store of the 4 converted items
+        _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi);
+    }
+
+    //leftover 0-3 samples go through the scalar helper
+    for (; i < nsamps; i++){
+        output[i] = uhd::byteswap(fc32_to_item32(input[i]));
+    }
+}
+
+#else
+//Fallback without intrinsics: convert nsamps complex floats to byteswapped 32-bit items.
+static UHD_INLINE void fc32_to_item32_bswap(
+    const fc32_t *input, item32_t *output, size_t nsamps
+){
+    for (size_t left = nsamps; left != 0; left--, input++, output++)
+        *output = uhd::byteswap(fc32_to_item32(*input));
+}
+
+#endif
+
+/***********************************************************************
+ * Convert items32 buffer to complex float
+ **********************************************************************/
+static const float floats_per_short = float(1.0/shorts_per_float);
+
+//Unpack a 32-bit item (real in the upper 16 bits, imag in the lower) and scale to a complex float.
+static UHD_INLINE fc32_t item32_to_fc32(item32_t item){
+    const boost::int16_t re = boost::int16_t(item >> 16);
+    const boost::int16_t im = boost::int16_t(item >> 0);
+    return fc32_t(float(re*floats_per_short), float(im*floats_per_short));
+}
+
+//Convert nsamps 32-bit items to complex floats without byteswapping.
+static UHD_INLINE void item32_to_fc32_nswap(
+    const item32_t *input, fc32_t *output, size_t nsamps
+){
+    const item32_t *const stop = input + nsamps;
+    while (input != stop) *output++ = item32_to_fc32(*input++);
+}
+
+#if defined(USE_EMMINTRIN_H)
+#include <emmintrin.h>
+
+static UHD_INLINE void item32_to_fc32_bswap(
+    const item32_t *input, fc32_t *output, size_t nsamps
+){ //SSE2 version: byteswapped 32-bit items -> complex floats, 4 samples per pass
+    __m128 scalar = _mm_set_ps1(floats_per_short/(1 << 16)); //extra 1/65536 cancels the 16-bit shift from the unpack
+    __m128i zeroi = _mm_setzero_si128();
+
+    //vector loop over the largest multiple of 4 samples (nsamps & ~0x3)
+    size_t i = 0; for (; i < (nsamps & ~0x3); i+=4){
+        //unaligned load of 4 items
+        __m128i tmpi = _mm_loadu_si128(reinterpret_cast<const __m128i *>(input+i));
+        //swap the two bytes of every 16-bit lane, then widen each short to 32 bits by
+        //interleaving it above a zero short -> byteswap 32 bit words
+        tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8));
+        __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); //value in upper 16 bits
+        __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi);
+
+        //signed 32-bit integer -> float, then scale (the scalar also divides out the << 16)
+        __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar);
+        __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar);
+
+        //unaligned stores: samples i,i+1 from tmplo and i+2,i+3 from tmphi
+        _mm_storeu_ps(reinterpret_cast<float *>(output+i+0), tmplo);
+        _mm_storeu_ps(reinterpret_cast<float *>(output+i+2), tmphi);
+    }
+
+    //leftover 0-3 samples go through the scalar helper
+    for (; i < nsamps; i++){
+        output[i] = item32_to_fc32(uhd::byteswap(input[i]));
+    }
+}
+
+#else
+//Fallback without intrinsics: convert nsamps byteswapped 32-bit items to complex floats.
+static UHD_INLINE void item32_to_fc32_bswap(
+    const item32_t *input, fc32_t *output, size_t nsamps
+){
+    for (size_t left = nsamps; left != 0; left--, input++, output++)
+        *output = item32_to_fc32(uhd::byteswap(*input));
+}
+
+#endif
+
+#endif /* INCLUDED_LIBUHD_TRANSPORT_CONVERT_TYPES_IMPL_HPP */
diff --git a/host/lib/transport/gen_convert_types.py b/host/lib/transport/gen_convert_types.py
index af2bcc7cb..951b634d9 100755
--- a/host/lib/transport/gen_convert_types.py
+++ b/host/lib/transport/gen_convert_types.py
@@ -24,66 +24,15 @@ TMPL_TEXT = """
 
 \#include <uhd/config.hpp>
 \#include <uhd/transport/convert_types.hpp>
-\#include <uhd/utils/byteswap.hpp>
 \#include <boost/cstdint.hpp>
 \#include <boost/detail/endian.hpp>
 \#include <stdexcept>
-\#include <complex>
-
-//define the endian macros to convert integers
-\#ifdef BOOST_BIG_ENDIAN
-    \#define BE_MACRO(x) x
-    \#define LE_MACRO(x) uhd::byteswap(x)
-    static const bool is_big_endian = true;
-\#else
-    \#define BE_MACRO(x) uhd::byteswap(x)
-    \#define LE_MACRO(x) x
-    static const bool is_big_endian = false;
-\#endif
+\#include "convert_types_impl.hpp"
 
 using namespace uhd;
 
 /***********************************************************************
- * Constants
- **********************************************************************/
-typedef std::complex<float>          fc32_t;
-typedef std::complex<boost::int16_t> sc16_t;
-typedef boost::uint32_t              item32_t;
-
-static const float shorts_per_float = float(32767);
-static const float floats_per_short = float(1.0/shorts_per_float);
-
-/***********************************************************************
- * Single-sample converters
- **********************************************************************/
-static UHD_INLINE item32_t sc16_to_item32(sc16_t num){
-    boost::uint16_t real = boost::int16_t(num.real());
-    boost::uint16_t imag = boost::int16_t(num.imag());
-    return (item32_t(real) << 16) | (item32_t(imag) << 0);
-}
-
-static UHD_INLINE sc16_t item32_to_sc16(item32_t item){
-    return sc16_t(
-        boost::uint16_t(item >> 16),
-        boost::uint16_t(item >> 0)
-    );
-}
-
-static UHD_INLINE item32_t fc32_to_item32(fc32_t num){
-    boost::uint16_t real = boost::int16_t(num.real()*shorts_per_float);
-    boost::uint16_t imag = boost::int16_t(num.imag()*shorts_per_float);
-    return (item32_t(real) << 16) | (item32_t(imag) << 0);
-}
-
-static UHD_INLINE fc32_t item32_to_fc32(item32_t item){
-    return fc32_t(
-        float(boost::int16_t(item >> 16)*floats_per_short),
-        float(boost::int16_t(item >> 0)*floats_per_short)
-    );
-}
-
-/***********************************************************************
- * Sample-buffer converters
+ * Generate predicate for jump table
  **********************************************************************/
 UHD_INLINE boost::uint8_t get_pred(
     const io_type_t &io_type,
@@ -92,27 +41,34 @@ UHD_INLINE boost::uint8_t get_pred(
     boost::uint8_t pred = 0;
 
     switch(otw_type.byteorder){
-    case otw_type_t::BO_BIG_ENDIAN:    pred |= $ph.be_p; break;
-    case otw_type_t::BO_LITTLE_ENDIAN: pred |= $ph.le_p; break;
-    ##let the compiler determine the native byte order (we could use python sys.byteorder)
-    case otw_type_t::BO_NATIVE:        pred |= (is_big_endian)? $ph.be_p : $ph.le_p; break;
-    default: throw std::runtime_error("unhandled byteorder type");
+    \#ifdef BOOST_BIG_ENDIAN
+    case otw_type_t::BO_BIG_ENDIAN:    pred |= $ph.nswap_p; break;
+    case otw_type_t::BO_LITTLE_ENDIAN: pred |= $ph.bswap_p; break;
+    \#else
+    case otw_type_t::BO_BIG_ENDIAN:    pred |= $ph.bswap_p; break;
+    case otw_type_t::BO_LITTLE_ENDIAN: pred |= $ph.nswap_p; break;
+    \#endif
+    case otw_type_t::BO_NATIVE:        pred |= $ph.nswap_p; break;
+    default: throw std::runtime_error("unhandled otw byteorder type");
     }
 
-    switch(otw_type.width){
-    case 16: pred |= $ph.w16_p; break;
-    default: throw std::runtime_error("unhandled bit width");
+    switch(otw_type.get_sample_size()){
+    case sizeof(boost::uint32_t): pred |= $ph.item32_p; break;
+    default: throw std::runtime_error("unhandled otw sample size");
     }
 
     switch(io_type.tid){
-    case io_type_t::COMPLEX_INT16:   pred |= $ph.sc16_p; break;
     case io_type_t::COMPLEX_FLOAT32: pred |= $ph.fc32_p; break;
+    case io_type_t::COMPLEX_INT16:   pred |= $ph.sc16_p; break;
     default: throw std::runtime_error("unhandled io type id");
     }
 
     return pred;
 }
 
+/***********************************************************************
+ * Convert host type to device type
+ **********************************************************************/
 void transport::convert_io_type_to_otw_type(
     const void *io_buff, const io_type_t &io_type,
     void *otw_buff, const otw_type_t &otw_type,
@@ -123,16 +79,16 @@ void transport::convert_io_type_to_otw_type(
     case $pred:
         #set $out_type = $ph.get_dev_type($pred)
         #set $in_type = $ph.get_host_type($pred)
-        #set $converter = $in_type+"_to_"+$out_type
-        #set $xe_macro = $ph.get_xe_macro($pred)
-        for (size_t i = 0; i < num_samps; i++){
-            (($(out_type)_t *)otw_buff)[i] = $(xe_macro)($(converter)(((const $(in_type)_t *)io_buff)[i]));
-        }
+        #set $converter = '_'.join([$in_type, 'to', $out_type, $ph.get_swap_type($pred)])
+        $(converter)((const $(in_type)_t *)io_buff, ($(out_type)_t *)otw_buff, num_samps);
         break;
     #end for
     }
 }
 
+/***********************************************************************
+ * Convert device type to host type
+ **********************************************************************/
 void transport::convert_otw_type_to_io_type(
     const void *otw_buff, const otw_type_t &otw_type,
     void *io_buff, const io_type_t &io_type,
@@ -143,11 +99,8 @@ void transport::convert_otw_type_to_io_type(
     case $pred:
         #set $out_type = $ph.get_host_type($pred)
         #set $in_type = $ph.get_dev_type($pred)
-        #set $converter = $in_type+"_to_"+$out_type
-        #set $xe_macro = $ph.get_xe_macro($pred)
-        for (size_t i = 0; i < num_samps; i++){
-            (($(out_type)_t *)io_buff)[i] = $(converter)($(xe_macro)(((const $(in_type)_t *)otw_buff)[i]));
-        }
+        #set $converter = '_'.join([$in_type, 'to', $out_type, $ph.get_swap_type($pred)])
+        $(converter)((const $(in_type)_t *)otw_buff, ($(out_type)_t *)io_buff, num_samps);
         break;
     #end for
     }
@@ -160,29 +113,32 @@ def parse_tmpl(_tmpl_text, **kwargs):
     return str(Template(_tmpl_text, kwargs))
 
 class ph:
-    be_p   = 0b00001
-    le_p   = 0b00000
-    w16_p  = 0b00000
-    sc16_p = 0b00010
-    fc32_p = 0b00000
+    bswap_p  = 0b00001 # bit 0 set: select the *_bswap converters
+    nswap_p  = 0b00000 # bit 0 clear: select the *_nswap converters
+    item32_p = 0b00000 # device type item32 contributes no bits
+    sc16_p   = 0b00010 # bit 1 set: host type sc16
+    fc32_p   = 0b00000 # bit 1 clear: host type fc32
 
     nbits = 2 #see above
 
     @staticmethod
-    def get_xe_macro(pred):
-        if (pred & ph.be_p) == ph.be_p: return 'BE_MACRO'
-        if (pred & ph.le_p) == ph.le_p: return 'LE_MACRO'
+    def has(pred, flag): return (pred & flag) == flag # always true when flag is 0
+
+    @staticmethod
+    def get_swap_type(pred):
+        if ph.has(pred, ph.bswap_p): return 'bswap'
+        if ph.has(pred, ph.nswap_p): return 'nswap' # nswap_p is 0, so this must come after the bswap test
         raise NotImplementedError
 
     @staticmethod
     def get_dev_type(pred):
-        if (pred & ph.w16_p) == ph.w16_p: return 'item32'
+        if ph.has(pred, ph.item32_p): return 'item32' # item32_p is 0, so every pred maps to item32
         raise NotImplementedError
 
     @staticmethod
     def get_host_type(pred):
-        if (pred & ph.sc16_p) == ph.sc16_p: return 'sc16'
-        if (pred & ph.fc32_p) == ph.fc32_p: return 'fc32'
+        if ph.has(pred, ph.sc16_p): return 'sc16'
+        if ph.has(pred, ph.fc32_p): return 'fc32' # fc32_p is 0, so this must come after the sc16 test
         raise NotImplementedError
 
 if __name__ == '__main__':
diff --git a/host/lib/transport/gen_vrt.py b/host/lib/transport/gen_vrt.py
index 6cdd6645d..8e0fce9ff 100755
--- a/host/lib/transport/gen_vrt.py
+++ b/host/lib/transport/gen_vrt.py
@@ -97,7 +97,7 @@ void vrt::pack_$(suffix)(
         #end if
         ########## Integer Time ##########
         #if $pred & $tsi_p
-            header_buff[$num_header_words] = $(XE_MACRO)(metadata.time_spec.secs);
+            header_buff[$num_header_words] = $(XE_MACRO)(boost::uint32_t(metadata.time_spec.get_full_secs()));
             #set $num_header_words += 1
             #set $flags |= (0x3 << 22);
         #end if
@@ -105,7 +105,7 @@ void vrt::pack_$(suffix)(
         #if $pred & $tsf_p
             header_buff[$num_header_words] = 0;
             #set $num_header_words += 1
-            header_buff[$num_header_words] = $(XE_MACRO)(metadata.time_spec.get_ticks(tick_rate));
+            header_buff[$num_header_words] = $(XE_MACRO)(boost::uint32_t(metadata.time_spec.get_tick_count(tick_rate)));
             #set $num_header_words += 1
             #set $flags |= (0x1 << 20);
         #end if
@@ -147,6 +147,7 @@ void vrt::unpack_$(suffix)(
 ){
     //clear the metadata
     metadata = rx_metadata_t();
+    boost::uint32_t secs = 0, ticks = 0;
 
     //extract vrt header
     boost::uint32_t vrt_hdr_word = $(XE_MACRO)(header_buff[0]);
@@ -169,7 +170,7 @@ void vrt::unpack_$(suffix)(
     switch(pred){
     #for $pred in range(2**5)
     case $pred:
-        #set $set_has_time_spec = False
+        #set $has_time_spec = False
         #set $num_header_words = 1
         ########## Stream ID ##########
         #if $pred & $sid_p
@@ -184,21 +185,21 @@ void vrt::unpack_$(suffix)(
         #end if
         ########## Integer Time ##########
         #if $pred & $tsi_p
-            metadata.has_time_spec = true;
-            #set $set_has_time_spec = True
-            metadata.time_spec.secs = $(XE_MACRO)(header_buff[$num_header_words]);
+            #set $has_time_spec = True
+            secs = $(XE_MACRO)(header_buff[$num_header_words]);
             #set $num_header_words += 1
         #end if
         ########## Fractional Time ##########
         #if $pred & $tsf_p
-            #if not $set_has_time_spec
-            metadata.has_time_spec = true;
-                #set $set_has_time_spec = True
-            #end if
+            #set $has_time_spec = True
             #set $num_header_words += 1
-            metadata.time_spec.set_ticks($(XE_MACRO)(header_buff[$num_header_words]), tick_rate);
+            ticks = $(XE_MACRO)(header_buff[$num_header_words]);
             #set $num_header_words += 1
         #end if
+        #if $has_time_spec
+            metadata.has_time_spec = true;
+            metadata.time_spec = time_spec_t(secs, ticks, tick_rate);
+        #end if
         ########## Trailer ##########
         #if $pred & $tlr_p
             #set $num_trailer_words = 1;
diff --git a/host/lib/transport/if_addrs.cpp b/host/lib/transport/if_addrs.cpp
index 5c8c8a176..ad9a2325b 100644
--- a/host/lib/transport/if_addrs.cpp
+++ b/host/lib/transport/if_addrs.cpp
@@ -27,7 +27,7 @@ uhd::transport::if_addrs_t::if_addrs_t(void){
 /***********************************************************************
  * Interface address discovery through ifaddrs api
  **********************************************************************/
-#ifdef HAVE_IFADDRS_H
+#if defined(HAVE_IFADDRS_H)
 #include <ifaddrs.h>
 
 static boost::asio::ip::address_v4 sockaddr_to_ip_addr(sockaddr *addr){
@@ -59,9 +59,9 @@ std::vector<uhd::transport::if_addrs_t> uhd::transport::get_if_addrs(void){
 }
 
 /***********************************************************************
- * Interface address discovery through windows api (TODO)
+ * Interface address discovery through windows api
  **********************************************************************/
-#elif HAVE_WINSOCK2_H
+#elif defined(HAVE_WINSOCK2_H)
 #include <winsock2.h>
 
 std::vector<uhd::transport::if_addrs_t> uhd::transport::get_if_addrs(void){
diff --git a/host/lib/transport/udp_zero_copy_asio.cpp b/host/lib/transport/udp_zero_copy_asio.cpp
index c3c02707e..7f9292d24 100644
--- a/host/lib/transport/udp_zero_copy_asio.cpp
+++ b/host/lib/transport/udp_zero_copy_asio.cpp
@@ -27,7 +27,8 @@ using namespace uhd::transport;
 /***********************************************************************
  * Constants
  **********************************************************************/
-static const size_t MIN_SOCK_BUFF_SIZE = size_t(100e3);
+//half a second of samples at full rate on usrp2: 4 bytes per sample * 25e6 samples/s * 0.5 s = 50e6 bytes
+static const size_t MIN_SOCK_BUFF_SIZE = size_t(sizeof(boost::uint32_t) * 25e6 * 0.5);
 static const size_t MAX_DGRAM_SIZE = 1500; //assume max size on send and recv
 static const double RECV_TIMEOUT = 0.1; //100 ms
 
@@ -159,6 +160,12 @@ template<typename Opt> static void resize_buff_helper(
     //otherwise, ensure that the buffer is at least the minimum size
     else if (udp_trans->get_buff_size<Opt>() < MIN_SOCK_BUFF_SIZE){
         resize_buff_helper<Opt>(udp_trans, MIN_SOCK_BUFF_SIZE, name);
+        if (udp_trans->get_buff_size<Opt>() < MIN_SOCK_BUFF_SIZE){
+            std::cerr << boost::format(
+                "Warning: the %s buffer size is smaller than the recommended size of %d bytes.\n"
+                "    See the USRP2 application notes on buffer resizing."
+            ) % name % MIN_SOCK_BUFF_SIZE << std::endl;
+        }
     }
 }