From ba0e3c8dcc90ea3e4cf015dea1038cb51a89b159 Mon Sep 17 00:00:00 2001 From: Jason Abele Date: Fri, 8 Jul 2011 12:56:29 -0700 Subject: Fix rounding in DBSRX2 tuning --- host/lib/usrp/dboard/db_dbsrx2.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'host/lib') diff --git a/host/lib/usrp/dboard/db_dbsrx2.cpp b/host/lib/usrp/dboard/db_dbsrx2.cpp index aaced7a5d..f646a238f 100644 --- a/host/lib/usrp/dboard/db_dbsrx2.cpp +++ b/host/lib/usrp/dboard/db_dbsrx2.cpp @@ -230,7 +230,7 @@ void dbsrx2::set_lo_freq(double target_freq){ N = (target_freq*R*ext_div)/(ref_freq); //actual spec range is (19, 251) intdiv = int(std::floor(N)); // if (intdiv < 19 or intdiv > 251) continue; - fracdiv = boost::math::iround((N - intdiv)*double(1 << 20)); + fracdiv = std::floor((N - intdiv)*double(1 << 20)); //calculate the actual freq from the values above N = double(intdiv) + double(fracdiv)/double(1 << 20); -- cgit v1.2.3 From 5239879e9f97bdbb6e3c531cee85824823ebff89 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Mon, 11 Jul 2011 13:02:21 -0700 Subject: usrp2: added a place for product code in eeprom map --- host/lib/usrp/mboard_eeprom.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'host/lib') diff --git a/host/lib/usrp/mboard_eeprom.cpp b/host/lib/usrp/mboard_eeprom.cpp index 2ee4a9284..f65b0aac8 100644 --- a/host/lib/usrp/mboard_eeprom.cpp +++ b/host/lib/usrp/mboard_eeprom.cpp @@ -75,6 +75,7 @@ static const uhd::dict USRP_N100_OFFSETS = boost::a ("mac-addr", 0x02) ("ip-addr", 0x0C) //leave space here for other addresses (perhaps) + ("prod-lsb-msb", 0x14) ("gpsdo", 0x17) ("serial", 0x18) ("name", 0x18 + SERIAL_LEN) @@ -92,6 +93,11 @@ static void load_n100(mboard_eeprom_t &mb_eeprom, i2c_iface &iface){ boost::uint16_t rev = (boost::uint16_t(rev_lsb_msb.at(0)) << 0) | (boost::uint16_t(rev_lsb_msb.at(1)) << 8); mb_eeprom["rev"] = boost::lexical_cast(rev); + //extract the product code + byte_vector_t prod_lsb_msb = iface.read_eeprom(N100_EEPROM_ADDR, USRP_N100_OFFSETS["prod-lsb-msb"], 2); + boost::uint16_t prod = (boost::uint16_t(prod_lsb_msb.at(0)) << 0) | (boost::uint16_t(prod_lsb_msb.at(1)) << 8); + mb_eeprom["product"] = (prod == 0 or prod == 0xffff)? "" : boost::lexical_cast(prod); + //extract the addresses mb_eeprom["mac-addr"] = mac_addr_t::from_bytes(iface.read_eeprom( N100_EEPROM_ADDR, USRP_N100_OFFSETS["mac-addr"], 6 @@ -140,6 +146,16 @@ static void store_n100(const mboard_eeprom_t &mb_eeprom, i2c_iface &iface){ iface.write_eeprom(N100_EEPROM_ADDR, USRP_N100_OFFSETS["rev-lsb-msb"], rev_lsb_msb); } + //parse the product code + if (mb_eeprom.has_key("product")){ + boost::uint16_t prod = boost::lexical_cast(mb_eeprom["product"]); + byte_vector_t prod_lsb_msb = boost::assign::list_of + (boost::uint8_t(prod >> 0)) + (boost::uint8_t(prod >> 8)) + ; + iface.write_eeprom(N100_EEPROM_ADDR, USRP_N100_OFFSETS["prod-lsb-msb"], prod_lsb_msb); + } + //store the addresses if (mb_eeprom.has_key("mac-addr")) iface.write_eeprom( N100_EEPROM_ADDR, USRP_N100_OFFSETS["mac-addr"], -- cgit v1.2.3 From 3fea33db9a5d951b9db7c7653eaae0e6291de0bc Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Thu, 14 Jul 2011 22:41:56 -0700 Subject: uhd: pulled misc changes from other branches into master --- host/CMakeLists.txt | 2 +- host/examples/benchmark_rate.cpp | 3 ++- host/lib/convert/convert_with_sse2.cpp | 40 +++++++++++++++++----------------- 3 files changed, 23 insertions(+), 22 deletions(-) (limited to 'host/lib') diff --git a/host/CMakeLists.txt b/host/CMakeLists.txt index 034257551..b5f8e57c2 100644 --- a/host/CMakeLists.txt +++ b/host/CMakeLists.txt @@ -129,7 +129,7 @@ IF(MSVC) ENDIF(BOOST_ALL_DYN_LINK) ENDIF(MSVC) -SET(Boost_ADDITIONAL_VERSIONS "1.42.0" "1.42" "1.43.0" "1.43" "1.44.0" "1.44" "1.45.0" "1.45" "1.46.0" "1.46") +SET(Boost_ADDITIONAL_VERSIONS "1.42.0" "1.42" "1.43.0" "1.43" "1.44.0" "1.44" "1.45.0" "1.45" "1.46.0" "1.46" "1.47.0" "1.47") FIND_PACKAGE(Boost 1.36 COMPONENTS ${BOOST_REQUIRED_COMPONENTS}) INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) diff --git a/host/examples/benchmark_rate.cpp b/host/examples/benchmark_rate.cpp index 688cd797a..774b240d4 100644 --- a/host/examples/benchmark_rate.cpp +++ b/host/examples/benchmark_rate.cpp @@ -157,6 +157,7 @@ void benchmark_tx_rate_async_helper(uhd::usrp::multi_usrp::sptr usrp){ * Main code + dispatcher **********************************************************************/ int UHD_SAFE_MAIN(int argc, char *argv[]){ + uhd::set_thread_priority_safe(); //variables to be set by po std::string args; @@ -177,7 +178,7 @@ int UHD_SAFE_MAIN(int argc, char *argv[]){ po::notify(vm); //print the help message - if (vm.count("help")){ + if (vm.count("help") or (vm.count("rx_rate") + vm.count("tx_rate")) == 0){ std::cout << boost::format("UHD Benchmark Rate %s") % desc << std::endl; std::cout << " Specify --rx_rate for a receive-only test.\n" diff --git a/host/lib/convert/convert_with_sse2.cpp b/host/lib/convert/convert_with_sse2.cpp index 0cbb2c444..0649baab4 100644 --- a/host/lib/convert/convert_with_sse2.cpp +++ b/host/lib/convert/convert_with_sse2.cpp @@ -28,10 +28,10 @@ DECLARE_CONVERTER(convert_fc32_1_to_item32_1_nswap, PRIORITY_CUSTOM){ const __m128 scalar = _mm_set_ps1(float(scale_factor)); #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ + for (; i+4 < nsamps; i+=4){ \ /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## _ps(reinterpret_cast(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## _ps(reinterpret_cast(input+i+2)); \ + __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+0)); \ + __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+2)); \ \ /* convert and scale */ \ __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ @@ -53,9 +53,9 @@ DECLARE_CONVERTER(convert_fc32_1_to_item32_1_nswap, PRIORITY_CUSTOM){ case 0x8: output[i] = fc32_to_item32(input[i], float(scale_factor)); i++; case 0x0: - convert_fc32_1_to_item32_1_nswap_guts() + convert_fc32_1_to_item32_1_nswap_guts(_) break; - default: convert_fc32_1_to_item32_1_nswap_guts(u) + default: convert_fc32_1_to_item32_1_nswap_guts(u_) } //convert remainder @@ -71,10 +71,10 @@ DECLARE_CONVERTER(convert_fc32_1_to_item32_1_bswap, PRIORITY_CUSTOM){ const __m128 scalar = _mm_set_ps1(float(scale_factor)); #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ + for (; i+4 < nsamps; i+=4){ \ /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## _ps(reinterpret_cast(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## _ps(reinterpret_cast(input+i+2)); \ + __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+0)); \ + __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+2)); \ \ /* convert and scale */ \ __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ @@ -95,9 +95,9 @@ DECLARE_CONVERTER(convert_fc32_1_to_item32_1_bswap, PRIORITY_CUSTOM){ case 0x8: output[i] = uhd::byteswap(fc32_to_item32(input[i], float(scale_factor))); i++; case 0x0: - convert_fc32_1_to_item32_1_bswap_guts() + convert_fc32_1_to_item32_1_bswap_guts(_) break; - default: convert_fc32_1_to_item32_1_bswap_guts(u) + default: convert_fc32_1_to_item32_1_bswap_guts(u_) } //convert remainder @@ -114,7 +114,7 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_nswap, PRIORITY_CUSTOM){ const __m128i zeroi = _mm_setzero_si128(); #define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ + for (; i+4 < nsamps; i+=4){ \ /* load from input */ \ __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ \ @@ -129,8 +129,8 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_nswap, PRIORITY_CUSTOM){ __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ \ /* store to output */ \ - _mm_store ## _al_ ## _ps(reinterpret_cast(output+i+0), tmplo); \ - _mm_store ## _al_ ## _ps(reinterpret_cast(output+i+2), tmphi); \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+0), tmplo); \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+2), tmphi); \ } \ size_t i = 0; @@ -140,9 +140,9 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_nswap, PRIORITY_CUSTOM){ case 0x8: output[i] = item32_to_fc32(input[i], float(scale_factor)); i++; case 0x0: - convert_item32_1_to_fc32_1_nswap_guts() + convert_item32_1_to_fc32_1_nswap_guts(_) break; - default: convert_item32_1_to_fc32_1_nswap_guts(u) + default: convert_item32_1_to_fc32_1_nswap_guts(u_) } //convert remainder @@ -159,7 +159,7 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_bswap, PRIORITY_CUSTOM){ const __m128i zeroi = _mm_setzero_si128(); #define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ + for (; i+4 < nsamps; i+=4){ \ /* load from input */ \ __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ \ @@ -173,8 +173,8 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_bswap, PRIORITY_CUSTOM){ __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ \ /* store to output */ \ - _mm_store ## _al_ ## _ps(reinterpret_cast(output+i+0), tmplo); \ - _mm_store ## _al_ ## _ps(reinterpret_cast(output+i+2), tmphi); \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+0), tmplo); \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+2), tmphi); \ } \ size_t i = 0; @@ -184,9 +184,9 @@ DECLARE_CONVERTER(convert_item32_1_to_fc32_1_bswap, PRIORITY_CUSTOM){ case 0x8: output[i] = item32_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); i++; case 0x0: - convert_item32_1_to_fc32_1_bswap_guts() + convert_item32_1_to_fc32_1_bswap_guts(_) break; - default: convert_item32_1_to_fc32_1_bswap_guts(u) + default: convert_item32_1_to_fc32_1_bswap_guts(u_) } //convert remainder -- cgit v1.2.3 From 7c84c240726114525b2501fb8e1bd8b112690983 Mon Sep 17 00:00:00 2001 From: Jason Abele Date: Fri, 15 Jul 2011 14:21:22 -0700 Subject: Updates to WBX dboard driver for version 3 support Use step attenuator for TX gain control --- host/lib/usrp/dboard/db_wbx_common.cpp | 40 +++++++++++++++++++++------------- host/lib/usrp/dboard/db_wbx_simple.cpp | 12 +++++++--- 2 files changed, 34 insertions(+), 18 deletions(-) (limited to 'host/lib') diff --git a/host/lib/usrp/dboard/db_wbx_common.cpp b/host/lib/usrp/dboard/db_wbx_common.cpp index 1a016e89c..c21ba80dc 100644 --- a/host/lib/usrp/dboard/db_wbx_common.cpp +++ b/host/lib/usrp/dboard/db_wbx_common.cpp @@ -44,7 +44,7 @@ #define TX_ATTN_MASK (TX_ATTN_16|TX_ATTN_8|TX_ATTN_4|TX_ATTN_2|TX_ATTN_1) // valid bits of TX Attenuator Control // Mixer functions -#define TX_MIXER_ENB (TXMOD_EN|ADF4350_PDBRF) +#define TX_MIXER_ENB (TXMOD_EN|ADF4350_PDBRF) // for v3, TXMOD_EN tied to ADF4350_PDBRF rather than separate #define TX_MIXER_DIS 0 #define RX_MIXER_ENB (RXBB_PDB|ADF4350_PDBRF) @@ -102,18 +102,19 @@ wbx_base::wbx_base(ctor_args_t args) : xcvr_dboard_base(args){ //v3 has different io bits for attenuator control int v3_iobits = is_v3() ? TX_ATTN_MASK : ADF4350_CE; + int v3_tx_mod = is_v3() ? ADF4350_PDBRF : TXMOD_EN|ADF4350_PDBRF; //set the gpio directions and atr controls - this->get_iface()->set_pin_ctrl(dboard_iface::UNIT_TX, TXMOD_EN|ADF4350_PDBRF); + this->get_iface()->set_pin_ctrl(dboard_iface::UNIT_TX, v3_tx_mod); this->get_iface()->set_pin_ctrl(dboard_iface::UNIT_RX, RXBB_PDB|ADF4350_PDBRF); - this->get_iface()->set_gpio_ddr(dboard_iface::UNIT_TX, TX_PUP_5V|TX_PUP_3V|TXMOD_EN|ADF4350_PDBRF|v3_iobits); + this->get_iface()->set_gpio_ddr(dboard_iface::UNIT_TX, TX_PUP_5V|TX_PUP_3V|v3_tx_mod|v3_iobits); this->get_iface()->set_gpio_ddr(dboard_iface::UNIT_RX, RX_PUP_5V|RX_PUP_3V|ADF4350_CE|RXBB_PDB|ADF4350_PDBRF|RX_ATTN_MASK); //setup ATR for the mixer enables (always enabled to prevent phase slip between bursts) - this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_IDLE, TX_MIXER_ENB, TX_MIXER_DIS | TX_MIXER_ENB); - this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_RX_ONLY, TX_MIXER_ENB, TX_MIXER_DIS | TX_MIXER_ENB); - this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_TX_ONLY, TX_MIXER_ENB, TX_MIXER_DIS | TX_MIXER_ENB); - this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_FULL_DUPLEX, TX_MIXER_ENB, TX_MIXER_DIS | TX_MIXER_ENB); + this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_IDLE, v3_tx_mod, TX_MIXER_DIS | v3_tx_mod); + this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_RX_ONLY, v3_tx_mod, TX_MIXER_DIS | v3_tx_mod); + this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_TX_ONLY, v3_tx_mod, TX_MIXER_DIS | v3_tx_mod); + this->get_iface()->set_atr_reg(dboard_iface::UNIT_TX, dboard_iface::ATR_REG_FULL_DUPLEX, v3_tx_mod, TX_MIXER_DIS | v3_tx_mod); this->get_iface()->set_atr_reg(dboard_iface::UNIT_RX, dboard_iface::ATR_REG_IDLE, RX_MIXER_ENB, RX_MIXER_DIS | RX_MIXER_ENB); this->get_iface()->set_atr_reg(dboard_iface::UNIT_RX, dboard_iface::ATR_REG_TX_ONLY, RX_MIXER_ENB, RX_MIXER_DIS | RX_MIXER_ENB); @@ -191,7 +192,7 @@ static int tx_pga0_gain_to_iobits(double &gain){ double attn = wbx_v3_tx_gain_ranges["PGA0"].stop() - gain; //calculate the attenuation - int attn_code = boost::math::iround(attn*2); + int attn_code = boost::math::iround(attn); int iobits = ( (attn_code & 16 ? 0 : TX_ATTN_16) | (attn_code & 8 ? 0 : TX_ATTN_8) | @@ -236,11 +237,11 @@ void wbx_base::set_tx_gain(double gain, const std::string &name){ if (is_v3()) { assert_has(wbx_v3_tx_gain_ranges.keys(), name, "wbx tx gain name"); if(name == "PGA0"){ - double dac_volts = tx_pga0_gain_to_iobits(gain); + boost::uint16_t io_bits = tx_pga0_gain_to_iobits(gain); _tx_gains[name] = gain; - //write the new voltage to the aux dac - this->get_iface()->write_aux_dac(dboard_iface::UNIT_TX, dboard_iface::AUX_DAC_A, dac_volts); + //write the new gain to tx gpio outputs + this->get_iface()->set_gpio_out(dboard_iface::UNIT_TX, io_bits, TX_ATTN_MASK); } else UHD_THROW_INVALID_CODE_PATH(); } @@ -447,7 +448,7 @@ bool wbx_base::get_locked(dboard_iface::unit_t unit){ } bool wbx_base::is_v3(void){ - return get_rx_id() == 0x057; + return get_rx_id().to_uint16() == 0x057; } /*********************************************************************** @@ -570,12 +571,21 @@ void wbx_base::tx_get(const wax::obj &key_, wax::obj &val){ return; case SUBDEV_PROP_GAIN_RANGE: - assert_has(wbx_tx_gain_ranges.keys(), key.name, "wbx tx gain name"); - val = wbx_tx_gain_ranges[key.name]; + if (is_v3()) { + assert_has(wbx_v3_tx_gain_ranges.keys(), key.name, "wbx tx gain name"); + val = wbx_v3_tx_gain_ranges[key.name]; + } + else { + assert_has(wbx_tx_gain_ranges.keys(), key.name, "wbx tx gain name"); + val = wbx_tx_gain_ranges[key.name]; + } return; case SUBDEV_PROP_GAIN_NAMES: - val = prop_names_t(wbx_tx_gain_ranges.keys()); + if (is_v3()) + val = prop_names_t(wbx_v3_tx_gain_ranges.keys()); + else + val = prop_names_t(wbx_tx_gain_ranges.keys()); return; case SUBDEV_PROP_FREQ: diff --git a/host/lib/usrp/dboard/db_wbx_simple.cpp b/host/lib/usrp/dboard/db_wbx_simple.cpp index 602ce389d..aa4937b19 100644 --- a/host/lib/usrp/dboard/db_wbx_simple.cpp +++ b/host/lib/usrp/dboard/db_wbx_simple.cpp @@ -16,7 +16,7 @@ // // Antenna constants -#define ANTSW_IO ((1 << 5)|(1 << 15)) // on UNIT_TX, 0 = TX, 1 = RX, on UNIT_RX 0 = main ant, 1 = RX2 +#define ANTSW_IO ((1 << 15)) // on UNIT_TX, 0 = TX, 1 = RX, on UNIT_RX 0 = main ant, 1 = RX2 #define ANT_TX 0 //the tx line is transmitting #define ANT_RX ANTSW_IO //the tx line is receiving #define ANT_TXRX 0 //the rx line is on txrx @@ -150,7 +150,10 @@ void wbx_simple::rx_get(const wax::obj &key_, wax::obj &val){ //handle the get request conditioned on the key switch(key.as()){ case SUBDEV_PROP_NAME: - val = std::string("WBX RX + Simple GDB"); + if (is_v3()) + val = std::string("WBX v3 RX + Simple GDB"); + else + val = std::string("WBX RX + Simple GDB"); return; case SUBDEV_PROP_FREQ: @@ -204,7 +207,10 @@ void wbx_simple::tx_get(const wax::obj &key_, wax::obj &val){ //handle the get request conditioned on the key switch(key.as()){ case SUBDEV_PROP_NAME: - val = std::string("WBX TX + Simple GDB"); + if (is_v3()) + val = std::string("WBX v3 TX + Simple GDB"); + else + val = std::string("WBX TX + Simple GDB"); return; case SUBDEV_PROP_FREQ: -- cgit v1.2.3 From 87d67d7777ed21121896b7733723ca3109e18c8c Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Fri, 15 Jul 2011 13:36:56 -0700 Subject: uhd: created SSE2 conversion routines for fc64 --- host/lib/convert/CMakeLists.txt | 10 +- host/lib/convert/convert_fc32_with_sse2.cpp | 196 +++++++++++++++++++++++++ host/lib/convert/convert_fc64_with_sse2.cpp | 212 ++++++++++++++++++++++++++++ host/lib/convert/convert_with_sse2.cpp | 196 ------------------------- 4 files changed, 414 insertions(+), 200 deletions(-) create mode 100644 host/lib/convert/convert_fc32_with_sse2.cpp create mode 100644 host/lib/convert/convert_fc64_with_sse2.cpp delete mode 100644 host/lib/convert/convert_with_sse2.cpp (limited to 'host/lib') diff --git a/host/lib/convert/CMakeLists.txt b/host/lib/convert/CMakeLists.txt index 5f05b0cb8..e6e8ec088 100644 --- a/host/lib/convert/CMakeLists.txt +++ b/host/lib/convert/CMakeLists.txt @@ -36,13 +36,15 @@ CHECK_INCLUDE_FILE_CXX(emmintrin.h HAVE_EMMINTRIN_H) UNSET(CMAKE_REQUIRED_FLAGS) IF(HAVE_EMMINTRIN_H) + SET(convert_with_sse2_sources + ${CMAKE_CURRENT_SOURCE_DIR}/convert_fc32_with_sse2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/convert_fc64_with_sse2.cpp + ) SET_SOURCE_FILES_PROPERTIES( - ${CMAKE_CURRENT_SOURCE_DIR}/convert_with_sse2.cpp + ${convert_with_sse2_sources} PROPERTIES COMPILE_FLAGS "${EMMINTRIN_FLAGS}" ) - LIBUHD_APPEND_SOURCES( - ${CMAKE_CURRENT_SOURCE_DIR}/convert_with_sse2.cpp - ) + LIBUHD_APPEND_SOURCES(${convert_with_sse2_sources}) ENDIF(HAVE_EMMINTRIN_H) ######################################################################## diff --git a/host/lib/convert/convert_fc32_with_sse2.cpp b/host/lib/convert/convert_fc32_with_sse2.cpp new file mode 100644 index 000000000..676e1561c --- /dev/null +++ b/host/lib/convert/convert_fc32_with_sse2.cpp @@ -0,0 +1,196 @@ +// +// Copyright 2011 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(convert_fc32_1_to_item32_1_nswap, PRIORITY_CUSTOM){ + const fc32_t *input = reinterpret_cast(inputs[0]); + item32_t *output = reinterpret_cast(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)); + + #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+0)); \ + __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+2)); \ + \ + /* convert and scale */ \ + __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ + __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ + \ + /* pack + swap 16-bit pairs */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + switch (size_t(input) & 0xf){ + case 0x8: + output[i] = fc32_to_item32(input[i], float(scale_factor)); i++; + case 0x0: + convert_fc32_1_to_item32_1_nswap_guts(_) + break; + default: convert_fc32_1_to_item32_1_nswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = fc32_to_item32(input[i], float(scale_factor)); + } +} + +DECLARE_CONVERTER(convert_fc32_1_to_item32_1_bswap, PRIORITY_CUSTOM){ + const fc32_t *input = reinterpret_cast(inputs[0]); + item32_t *output = reinterpret_cast(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)); + + #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+0)); \ + __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+2)); \ + \ + /* convert and scale */ \ + __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ + __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ + \ + /* pack + byteswap -> byteswap 16 bit words */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + switch (size_t(input) & 0xf){ + case 0x8: + output[i] = uhd::byteswap(fc32_to_item32(input[i], float(scale_factor))); i++; + case 0x0: + convert_fc32_1_to_item32_1_bswap_guts(_) + break; + default: convert_fc32_1_to_item32_1_bswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = uhd::byteswap(fc32_to_item32(input[i], float(scale_factor))); + } +} + +DECLARE_CONVERTER(convert_item32_1_to_fc32_1_nswap, PRIORITY_CUSTOM){ + const item32_t *input = reinterpret_cast(inputs[0]); + fc32_t *output = reinterpret_cast(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); + const __m128i zeroi = _mm_setzero_si128(); + + #define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ + __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+0), tmplo); \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+2), tmphi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + switch (size_t(output) & 0xf){ + case 0x8: + output[i] = item32_to_fc32(input[i], float(scale_factor)); i++; + case 0x0: + convert_item32_1_to_fc32_1_nswap_guts(_) + break; + default: convert_item32_1_to_fc32_1_nswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = item32_to_fc32(input[i], float(scale_factor)); + } +} + +DECLARE_CONVERTER(convert_item32_1_to_fc32_1_bswap, PRIORITY_CUSTOM){ + const item32_t *input = reinterpret_cast(inputs[0]); + fc32_t *output = reinterpret_cast(outputs[0]); + + const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); + const __m128i zeroi = _mm_setzero_si128(); + + #define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ + __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+0), tmplo); \ + _mm_store ## _al_ ## ps(reinterpret_cast(output+i+2), tmphi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + switch (size_t(output) & 0xf){ + case 0x8: + output[i] = item32_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); i++; + case 0x0: + convert_item32_1_to_fc32_1_bswap_guts(_) + break; + default: convert_item32_1_to_fc32_1_bswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = item32_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); + } +} diff --git a/host/lib/convert/convert_fc64_with_sse2.cpp b/host/lib/convert/convert_fc64_with_sse2.cpp new file mode 100644 index 000000000..4d28396a4 --- /dev/null +++ b/host/lib/convert/convert_fc64_with_sse2.cpp @@ -0,0 +1,212 @@ +// +// Copyright 2011 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include "convert_common.hpp" +#include +#include + +using namespace uhd::convert; + +DECLARE_CONVERTER(convert_fc64_1_to_item32_1_nswap, PRIORITY_CUSTOM){ + const fc64_t *input = reinterpret_cast(inputs[0]); + item32_t *output = reinterpret_cast(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor); + + #define convert_fc64_1_to_item32_1_nswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+0)); \ + __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+1)); \ + __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+2)); \ + __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+3)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ + __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ + __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ + __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ + \ + /* pack + swap 16-bit pairs */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + if ((size_t(input) & 0xf) == 0){ + convert_fc64_1_to_item32_1_nswap_guts(_) + } + else{ + convert_fc64_1_to_item32_1_nswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = fc64_to_item32(input[i], scale_factor); + } +} + +DECLARE_CONVERTER(convert_fc64_1_to_item32_1_bswap, PRIORITY_CUSTOM){ + const fc64_t *input = reinterpret_cast(inputs[0]); + item32_t *output = reinterpret_cast(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor); + + #define convert_fc64_1_to_item32_1_bswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128d tmp0 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+0)); \ + __m128d tmp1 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+1)); \ + __m128d tmp2 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+2)); \ + __m128d tmp3 = _mm_load ## _al_ ## pd(reinterpret_cast(input+i+3)); \ + \ + /* convert and scale */ \ + __m128i tmpi0 = _mm_cvttpd_epi32(_mm_mul_pd(tmp0, scalar)); \ + __m128i tmpi1 = _mm_cvttpd_epi32(_mm_mul_pd(tmp1, scalar)); \ + __m128i tmpilo = _mm_unpacklo_epi64(tmpi0, tmpi1); \ + __m128i tmpi2 = _mm_cvttpd_epi32(_mm_mul_pd(tmp2, scalar)); \ + __m128i tmpi3 = _mm_cvttpd_epi32(_mm_mul_pd(tmp3, scalar)); \ + __m128i tmpihi = _mm_unpacklo_epi64(tmpi2, tmpi3); \ + \ + /* pack + byteswap -> byteswap 16 bit words */ \ + __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + \ + /* store to output */ \ + _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + if ((size_t(input) & 0xf) == 0){ + convert_fc64_1_to_item32_1_bswap_guts(_) + } + else{ + convert_fc64_1_to_item32_1_bswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = uhd::byteswap(fc64_to_item32(input[i], scale_factor)); + } +} + +DECLARE_CONVERTER(convert_item32_1_to_fc64_1_nswap, PRIORITY_CUSTOM){ + const item32_t *input = reinterpret_cast(inputs[0]); + fc64_t *output = reinterpret_cast(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16)); + const __m128i zeroi = _mm_setzero_si128(); + + #define convert_item32_1_to_fc64_1_nswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ + \ + /* unpack + swap 16-bit pairs */ \ + tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ + __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \ + __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \ + __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+0), tmp0); \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+1), tmp1); \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+2), tmp2); \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+3), tmp3); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + if ((size_t(output) & 0xf) == 0){ + convert_item32_1_to_fc64_1_nswap_guts(_) + } + else{ + convert_item32_1_to_fc64_1_nswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = item32_to_fc64(input[i], scale_factor); + } +} + +DECLARE_CONVERTER(convert_item32_1_to_fc64_1_bswap, PRIORITY_CUSTOM){ + const item32_t *input = reinterpret_cast(inputs[0]); + fc64_t *output = reinterpret_cast(outputs[0]); + + const __m128d scalar = _mm_set1_pd(scale_factor/(1 << 16)); + const __m128i zeroi = _mm_setzero_si128(); + + #define convert_item32_1_to_fc64_1_bswap_guts(_al_) \ + for (; i+4 < nsamps; i+=4){ \ + /* load from input */ \ + __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ + \ + /* byteswap + unpack -> byteswap 16 bit words */ \ + tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ + __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ + __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ + \ + /* convert and scale */ \ + __m128d tmp0 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + tmpilo = _mm_unpackhi_epi64(tmpilo, zeroi); \ + __m128d tmp1 = _mm_mul_pd(_mm_cvtepi32_pd(tmpilo), scalar); \ + __m128d tmp2 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + tmpihi = _mm_unpackhi_epi64(tmpihi, zeroi); \ + __m128d tmp3 = _mm_mul_pd(_mm_cvtepi32_pd(tmpihi), scalar); \ + \ + /* store to output */ \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+0), tmp0); \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+1), tmp1); \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+2), tmp2); \ + _mm_store ## _al_ ## pd(reinterpret_cast(output+i+3), tmp3); \ + } \ + + size_t i = 0; + + //dispatch according to alignment + if ((size_t(output) & 0xf) == 0){ + convert_item32_1_to_fc64_1_bswap_guts(_) + } + else{ + convert_item32_1_to_fc64_1_bswap_guts(u_) + } + + //convert remainder + for (; i < nsamps; i++){ + output[i] = item32_to_fc64(uhd::byteswap(input[i]), scale_factor); + } +} diff --git a/host/lib/convert/convert_with_sse2.cpp b/host/lib/convert/convert_with_sse2.cpp deleted file mode 100644 index 0649baab4..000000000 --- a/host/lib/convert/convert_with_sse2.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// -// Copyright 2011-2011 Ettus Research LLC -// -// This program is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program. If not, see . -// - -#include "convert_common.hpp" -#include -#include - -using namespace uhd::convert; - -DECLARE_CONVERTER(convert_fc32_1_to_item32_1_nswap, PRIORITY_CUSTOM){ - const fc32_t *input = reinterpret_cast(inputs[0]); - item32_t *output = reinterpret_cast(outputs[0]); - - const __m128 scalar = _mm_set_ps1(float(scale_factor)); - - #define convert_fc32_1_to_item32_1_nswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ - /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+2)); \ - \ - /* convert and scale */ \ - __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ - __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ - \ - /* pack + swap 16-bit pairs */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - switch (size_t(input) & 0xf){ - case 0x8: - output[i] = fc32_to_item32(input[i], float(scale_factor)); i++; - case 0x0: - convert_fc32_1_to_item32_1_nswap_guts(_) - break; - default: convert_fc32_1_to_item32_1_nswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = fc32_to_item32(input[i], float(scale_factor)); - } -} - -DECLARE_CONVERTER(convert_fc32_1_to_item32_1_bswap, PRIORITY_CUSTOM){ - const fc32_t *input = reinterpret_cast(inputs[0]); - item32_t *output = reinterpret_cast(outputs[0]); - - const __m128 scalar = _mm_set_ps1(float(scale_factor)); - - #define convert_fc32_1_to_item32_1_bswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ - /* load from input */ \ - __m128 tmplo = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+0)); \ - __m128 tmphi = _mm_load ## _al_ ## ps(reinterpret_cast(input+i+2)); \ - \ - /* convert and scale */ \ - __m128i tmpilo = _mm_cvtps_epi32(_mm_mul_ps(tmplo, scalar)); \ - __m128i tmpihi = _mm_cvtps_epi32(_mm_mul_ps(tmphi, scalar)); \ - \ - /* pack + byteswap -> byteswap 16 bit words */ \ - __m128i tmpi = _mm_packs_epi32(tmpilo, tmpihi); \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - \ - /* store to output */ \ - _mm_storeu_si128(reinterpret_cast<__m128i *>(output+i), tmpi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - switch (size_t(input) & 0xf){ - case 0x8: - output[i] = uhd::byteswap(fc32_to_item32(input[i], float(scale_factor))); i++; - case 0x0: - convert_fc32_1_to_item32_1_bswap_guts(_) - break; - default: convert_fc32_1_to_item32_1_bswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = uhd::byteswap(fc32_to_item32(input[i], float(scale_factor))); - } -} - -DECLARE_CONVERTER(convert_item32_1_to_fc32_1_nswap, PRIORITY_CUSTOM){ - const item32_t *input = reinterpret_cast(inputs[0]); - fc32_t *output = reinterpret_cast(outputs[0]); - - const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); - const __m128i zeroi = _mm_setzero_si128(); - - #define convert_item32_1_to_fc32_1_nswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ - \ - /* unpack + swap 16-bit pairs */ \ - tmpi = _mm_shufflelo_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - tmpi = _mm_shufflehi_epi16(tmpi, _MM_SHUFFLE(2, 3, 0, 1)); \ - __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ - __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ - \ - /* convert and scale */ \ - __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ - __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## ps(reinterpret_cast(output+i+0), tmplo); \ - _mm_store ## _al_ ## ps(reinterpret_cast(output+i+2), tmphi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - switch (size_t(output) & 0xf){ - case 0x8: - output[i] = item32_to_fc32(input[i], float(scale_factor)); i++; - case 0x0: - convert_item32_1_to_fc32_1_nswap_guts(_) - break; - default: convert_item32_1_to_fc32_1_nswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = item32_to_fc32(input[i], float(scale_factor)); - } -} - -DECLARE_CONVERTER(convert_item32_1_to_fc32_1_bswap, PRIORITY_CUSTOM){ - const item32_t *input = reinterpret_cast(inputs[0]); - fc32_t *output = reinterpret_cast(outputs[0]); - - const __m128 scalar = _mm_set_ps1(float(scale_factor)/(1 << 16)); - const __m128i zeroi = _mm_setzero_si128(); - - #define convert_item32_1_to_fc32_1_bswap_guts(_al_) \ - for (; i+4 < nsamps; i+=4){ \ - /* load from input */ \ - __m128i tmpi = _mm_loadu_si128(reinterpret_cast(input+i)); \ - \ - /* byteswap + unpack -> byteswap 16 bit words */ \ - tmpi = _mm_or_si128(_mm_srli_epi16(tmpi, 8), _mm_slli_epi16(tmpi, 8)); \ - __m128i tmpilo = _mm_unpacklo_epi16(zeroi, tmpi); /* value in upper 16 bits */ \ - __m128i tmpihi = _mm_unpackhi_epi16(zeroi, tmpi); \ - \ - /* convert and scale */ \ - __m128 tmplo = _mm_mul_ps(_mm_cvtepi32_ps(tmpilo), scalar); \ - __m128 tmphi = _mm_mul_ps(_mm_cvtepi32_ps(tmpihi), scalar); \ - \ - /* store to output */ \ - _mm_store ## _al_ ## ps(reinterpret_cast(output+i+0), tmplo); \ - _mm_store ## _al_ ## ps(reinterpret_cast(output+i+2), tmphi); \ - } \ - - size_t i = 0; - - //dispatch according to alignment - switch (size_t(output) & 0xf){ - case 0x8: - output[i] = item32_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); i++; - case 0x0: - convert_item32_1_to_fc32_1_bswap_guts(_) - break; - default: convert_item32_1_to_fc32_1_bswap_guts(u_) - } - - //convert remainder - for (; i < nsamps; i++){ - output[i] = item32_to_fc32(uhd::byteswap(input[i]), float(scale_factor)); - } -} -- cgit v1.2.3 From 60edbd7f59c873ce69ddb5b337e640f479a28321 Mon Sep 17 00:00:00 2001 From: Josh Blum Date: Sun, 17 Jul 2011 20:12:56 -0700 Subject: udp: squashed the wsa work and added documentation work --- host/docs/transport.rst | 42 +++-- host/lib/transport/CMakeLists.txt | 10 +- host/lib/transport/udp_wsa_zero_copy.cpp | 281 +++++++++++++++++++++++++++++++ 3 files changed, 320 insertions(+), 13 deletions(-) create mode 100644 host/lib/transport/udp_wsa_zero_copy.cpp (limited to 'host/lib') diff --git a/host/docs/transport.rst b/host/docs/transport.rst index f28d1efcb..b601cd8ff 100644 --- a/host/docs/transport.rst +++ b/host/docs/transport.rst @@ -19,7 +19,10 @@ The transport parameters are defined below for the various transports in the UHD ------------------------------------------------------------------------ UDP transport (sockets) ------------------------------------------------------------------------ -The UDP transport is implemented with standard user-space/Berkeley sockets. +The UDP transport is implemented with user-space sockets: + +* **UNIX:** standard Berkeley sockets API using send()/recv() +* **Windows:** Windows Sockets API (WSA) using overlapped IO ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Transport parameters @@ -31,21 +34,18 @@ The following parameters can be used to alter the transport's default behavior: * **send_frame_size:** The size of a single send buffer in bytes * **num_send_frames:** The number of send buffers to allocate -**Note1:** num_recv_frames and num_send_frames do not affect performance. +**Note1:** +num_recv_frames does not affect performance (all platforms). + +**Note2:** +num_send_frames does not affect performance (UNIX only). -**Note2:** recv_frame_size and send_frame_size can be used to +**Note3:** +recv_frame_size and send_frame_size can be used to increase or decrease the maximum number of samples per packet. The frame sizes default to an MTU of 1472 bytes per IP/UDP packet, and may be increased if permitted by your network hardware. -**Note3:** For lower latency at low sample rates, use smaller buffers - -**Note4:** For overall lower latency, look for Interrupt Coalescing settings -for your OS and ethernet chipset. It seems the Intel ethernet chipsets offer -fine-grained control in Linux. Also, consult: - -* http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.prftungd/doc/prftungd/interrupt_coal.htm - ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Flow control parameters ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -72,6 +72,25 @@ The following parameters can be used to alter socket's buffer sizes: **Note:** Large send buffers tend to decrease transmit performance. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Latency Optimization +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +Latency is a measurement of the time it takes a sample to travel between the host and device. +Most computer hardware and software is bandwidth optimized which may negatively affect latency. +If your application has strict latency requirements, please consider the following notes: + +**Note1:** +The time taken by the device to populate a packet is proportional to the sample rate. +Therefore, to improve receive latency, configure the transport for a smaller frame size. + +**Note2:** +For overall latency improvements, +look for "Interrupt Coalescing" settings for your OS and ethernet chipset. +It seems the Intel ethernet chipsets offer fine-grained control in Linux. +Also, consult: + +* http://publib.boulder.ibm.com/infocenter/pseries/v5r3/index.jsp?topic=/com.ibm.aix.prftungd/doc/prftungd/interrupt_coal.htm + ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Linux specific notes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -90,7 +109,6 @@ Windows specific notes ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ On Windows, it is important to change the default UDP behavior such that 1500 byte packets still travel through the fast path of the sockets stack. - FastSendDatagramThreshold registry key to change documented here: * http://www.microsoft.com/windows/windowsmedia/howto/articles/optimize_web.aspx#appendix_e diff --git a/host/lib/transport/CMakeLists.txt b/host/lib/transport/CMakeLists.txt index b1821956c..866ade75f 100644 --- a/host/lib/transport/CMakeLists.txt +++ b/host/lib/transport/CMakeLists.txt @@ -79,6 +79,15 @@ SET_SOURCE_FILES_PROPERTIES( PROPERTIES COMPILE_DEFINITIONS "${IF_ADDRS_DEFS}" ) +######################################################################## +# Setup UDP +######################################################################## +IF(WIN32) + LIBUHD_APPEND_SOURCES(${CMAKE_CURRENT_SOURCE_DIR}/udp_wsa_zero_copy.cpp) +ELSE() + LIBUHD_APPEND_SOURCES(${CMAKE_CURRENT_SOURCE_DIR}/udp_zero_copy.cpp) +ENDIF() + #On windows, the boost asio implementation uses the winsock2 library. #Note: we exclude the .lib extension for cygwin and mingw platforms. IF(WIN32) @@ -97,6 +106,5 @@ LIBUHD_APPEND_SOURCES( ${CMAKE_CURRENT_SOURCE_DIR}/buffer_pool.cpp ${CMAKE_CURRENT_SOURCE_DIR}/if_addrs.cpp ${CMAKE_CURRENT_SOURCE_DIR}/udp_simple.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/udp_zero_copy.cpp ${CMAKE_CURRENT_SOURCE_DIR}/usb_zero_copy_wrapper.cpp ) diff --git a/host/lib/transport/udp_wsa_zero_copy.cpp b/host/lib/transport/udp_wsa_zero_copy.cpp new file mode 100644 index 000000000..ccfed38ea --- /dev/null +++ b/host/lib/transport/udp_wsa_zero_copy.cpp @@ -0,0 +1,281 @@ +// +// Copyright 2010-2011 Ettus Research LLC +// +// This program is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program. If not, see . +// + +#include "udp_common.hpp" +#include +#include //mtu +#include +#include +#include +#include +#include +#include + +using namespace uhd; +using namespace uhd::transport; +namespace asio = boost::asio; + +//A reasonable number of frames for send/recv and async/sync +static const size_t DEFAULT_NUM_FRAMES = 32; + +/*********************************************************************** + * Static initialization to take care of WSA init and cleanup + **********************************************************************/ +struct uhd_wsa_control{ + uhd_wsa_control(void){ + WSADATA wsaData; + WSAStartup(MAKEWORD(2, 2), &wsaData); /*windows socket startup */ + } + + ~uhd_wsa_control(void){ + WSACleanup(); + } +}; + +/*********************************************************************** + * Reusable managed receiver buffer: + * - Initialize with memory and a release callback. + * - Call get new with a length in bytes to re-use. + **********************************************************************/ +class udp_zero_copy_asio_mrb : public managed_recv_buffer{ +public: + udp_zero_copy_asio_mrb(void *mem, bounded_buffer &pending): + _mem(mem), _len(0), _pending(pending){/* NOP */} + + void release(void){ + if (_len == 0) return; + _pending.push_with_haste(this); + _len = 0; + } + + sptr get_new(size_t len){ + _len = len; + return make_managed_buffer(this); + } + + template T cast(void) const{return static_cast(_mem);} + +private: + const void *get_buff(void) const{return _mem;} + size_t get_size(void) const{return _len;} + + void *_mem; + size_t _len; + bounded_buffer &_pending; +}; + +/*********************************************************************** + * Reusable managed send buffer: + * - committing the buffer calls the asynchronous socket send + * - getting a new buffer performs the blocking wait for completion + **********************************************************************/ +class udp_zero_copy_asio_msb : public managed_send_buffer{ +public: + udp_zero_copy_asio_msb(void *mem, int sock_fd, const size_t frame_size): + _sock_fd(sock_fd), _frame_size(frame_size), _committed(false) + { + _wsa_buff.buf = reinterpret_cast(mem); + ZeroMemory(&_overlapped, sizeof(_overlapped)); + _overlapped.hEvent = WSACreateEvent(); + UHD_ASSERT_THROW(_overlapped.hEvent != WSA_INVALID_EVENT); + this->commit(0); //makes buffer available via get_new + } + + ~udp_zero_copy_asio_msb(void){ + WSACloseEvent(_overlapped.hEvent); + } + + UHD_INLINE void commit(size_t len){ + if (_committed) return; + _committed = true; + _wsa_buff.len = len; + if (len == 0) WSASetEvent(_overlapped.hEvent); + else WSASend(_sock_fd, &_wsa_buff, 1, NULL, 0, &_overlapped, NULL); + } + + UHD_INLINE sptr get_new(const double timeout, size_t &index){ + const DWORD result = WSAWaitForMultipleEvents( + 1, &_overlapped.hEvent, true, DWORD(timeout*1000), true + ); + if (result == WSA_WAIT_TIMEOUT) return managed_send_buffer::sptr(); + index++; //advances the caller's buffer + + WSAResetEvent(_overlapped.hEvent); + _committed = false; + _wsa_buff.len = _frame_size; + return make_managed_buffer(this); + } + +private: + void *get_buff(void) const{return _wsa_buff.buf;} + size_t get_size(void) const{return _wsa_buff.len;} + + int _sock_fd; + const size_t _frame_size; + bool _committed; + WSAOVERLAPPED _overlapped; + WSABUF _wsa_buff; +}; + +/*********************************************************************** + * Zero Copy UDP implementation with WSA: + * + * This is not a true zero copy implementation as each + * send and recv requires a copy operation to/from userspace. + * + * For receive, use a blocking recv() call on the socket. + * This has better performance than the overlapped IO. + * For send, use overlapped IO to submit async sends. + **********************************************************************/ +class udp_zero_copy_wsa_impl : public udp_zero_copy{ +public: + typedef boost::shared_ptr sptr; + + udp_zero_copy_wsa_impl( + const std::string &addr, + const std::string &port, + const device_addr_t &hints + ): + _recv_frame_size(size_t(hints.cast("recv_frame_size", udp_simple::mtu))), + _num_recv_frames(size_t(hints.cast("num_recv_frames", DEFAULT_NUM_FRAMES))), + _send_frame_size(size_t(hints.cast("send_frame_size", udp_simple::mtu))), + _num_send_frames(size_t(hints.cast("num_send_frames", DEFAULT_NUM_FRAMES))), + _recv_buffer_pool(buffer_pool::make(_num_recv_frames, _recv_frame_size)), + _send_buffer_pool(buffer_pool::make(_num_send_frames, _send_frame_size)), + _pending_recv_buffs(_num_recv_frames), + _next_send_buff_index(0) + { + UHD_MSG(status) << boost::format("Creating WSA UDP transport for %s:%s") % addr % port << std::endl; + static uhd_wsa_control uhd_wsa; //makes wsa start happen via lazy initialization + + UHD_ASSERT_THROW(_num_send_frames <= WSA_MAXIMUM_WAIT_EVENTS); + + //resolve the address + asio::io_service io_service; + asio::ip::udp::resolver resolver(io_service); + asio::ip::udp::resolver::query query(asio::ip::udp::v4(), addr, port); + asio::ip::udp::endpoint receiver_endpoint = *resolver.resolve(query); + + //create the socket + _sock_fd = WSASocket(AF_INET, SOCK_DGRAM, IPPROTO_UDP, NULL, 0, WSA_FLAG_OVERLAPPED); + if (_sock_fd == INVALID_SOCKET){ + const DWORD error = WSAGetLastError(); + throw uhd::os_error(str(boost::format("WSASocket() failed with error %d") % error)); + } + + //set the socket non-blocking for recv + u_long mode = 1; + ioctlsocket(_sock_fd, FIONBIO, &mode); + + //resize the socket buffers + const int recv_buff_size = int(hints.cast("recv_buff_size", 0.0)); + const int send_buff_size = int(hints.cast("send_buff_size", 0.0)); + if (recv_buff_size > 0) setsockopt(_sock_fd, SOL_SOCKET, SO_RCVBUF, (const char *)&recv_buff_size, sizeof(recv_buff_size)); + if (send_buff_size > 0) setsockopt(_sock_fd, SOL_SOCKET, SO_SNDBUF, (const char *)&send_buff_size, sizeof(send_buff_size)); + + //connect the socket so we can send/recv + const asio::ip::udp::endpoint::data_type &servaddr = *receiver_endpoint.data(); + if (WSAConnect(_sock_fd, (const struct sockaddr *)&servaddr, sizeof(servaddr), NULL, NULL, NULL, NULL) != 0){ + const DWORD error = WSAGetLastError(); + closesocket(_sock_fd); + throw uhd::os_error(str(boost::format("WSAConnect() failed with error %d") % error)); + } + + //allocate re-usable managed receive buffers + for (size_t i = 0; i < get_num_recv_frames(); i++){ + _mrb_pool.push_back(boost::shared_ptr( + new udp_zero_copy_asio_mrb(_recv_buffer_pool->at(i), _pending_recv_buffs) + )); + _pending_recv_buffs.push_with_haste(_mrb_pool.back().get()); + } + + //allocate re-usable managed send buffers + for (size_t i = 0; i < get_num_send_frames(); i++){ + _msb_pool.push_back(boost::shared_ptr( + new udp_zero_copy_asio_msb(_send_buffer_pool->at(i), _sock_fd, get_send_frame_size()) + )); + } + } + + ~udp_zero_copy_wsa_impl(void){ + closesocket(_sock_fd); + } + + /******************************************************************* + * Receive implementation: + * + * Perform a non-blocking receive for performance, + * and then fall back to a blocking receive with timeout. + * Return the managed receive buffer with the new length. + * When the caller is finished with the managed buffer, + * the managed receive buffer is released back into the queue. + ******************************************************************/ + managed_recv_buffer::sptr get_recv_buff(double timeout){ + udp_zero_copy_asio_mrb *mrb = NULL; + if (_pending_recv_buffs.pop_with_timed_wait(mrb, timeout)){ + + ssize_t ret = ::recv(_sock_fd, mrb->cast(), _recv_frame_size, 0); + if (ret > 0) return mrb->get_new(ret); + + if (wait_for_recv_ready(_sock_fd, timeout)) return mrb->get_new( + ::recv(_sock_fd, mrb->cast(), _recv_frame_size, 0) + ); + + _pending_recv_buffs.push_with_haste(mrb); //timeout: return the managed buffer to the queue + } + return managed_recv_buffer::sptr(); + } + + size_t get_num_recv_frames(void) const {return _num_recv_frames;} + size_t get_recv_frame_size(void) const {return _recv_frame_size;} + + /******************************************************************* + * Send implementation: + * Block on the managed buffer's get call and advance the index. + ******************************************************************/ + managed_send_buffer::sptr get_send_buff(double timeout){ + if (_next_send_buff_index == _num_send_frames) _next_send_buff_index = 0; + return _msb_pool[_next_send_buff_index]->get_new(timeout, _next_send_buff_index); + } + + size_t get_num_send_frames(void) const {return _num_send_frames;} + size_t get_send_frame_size(void) const {return _send_frame_size;} + +private: + //memory management -> buffers and fifos + const size_t _recv_frame_size, _num_recv_frames; + const size_t _send_frame_size, _num_send_frames; + buffer_pool::sptr _recv_buffer_pool, _send_buffer_pool; + std::vector > _msb_pool; + std::vector > _mrb_pool; + bounded_buffer _pending_recv_buffs; + size_t _next_send_buff_index; + + //socket guts + SOCKET _sock_fd; +}; + +/*********************************************************************** + * UDP zero copy make function + **********************************************************************/ +udp_zero_copy::sptr udp_zero_copy::make( + const std::string &addr, + const std::string &port, + const device_addr_t &hints +){ + return sptr(new udp_zero_copy_wsa_impl(addr, port, hints)); +} -- cgit v1.2.3