From e348353c4f5acef6a5ece11e9c336df4c15d65e1 Mon Sep 17 00:00:00 2001 From: Michael West Date: Wed, 29 Mar 2017 13:10:32 -0700 Subject: Implement worker threads to offload conversion of data and transport I/O for send() calls. - One worker thread per channel provides for improved scalability --- host/lib/transport/super_send_packet_handler.hpp | 211 ++++++++++++++++++----- 1 file changed, 170 insertions(+), 41 deletions(-) (limited to 'host/lib/transport') diff --git a/host/lib/transport/super_send_packet_handler.hpp b/host/lib/transport/super_send_packet_handler.hpp index 0acc8df4b..431cbf216 100644 --- a/host/lib/transport/super_send_packet_handler.hpp +++ b/host/lib/transport/super_send_packet_handler.hpp @@ -29,10 +29,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include @@ -49,6 +52,9 @@ namespace uhd { namespace transport { namespace sph { +static const size_t MAX_INTERLEAVE = 4; +static const double GET_BUFF_TIMEOUT = 0.1; + /*********************************************************************** * Super send packet handler * @@ -68,19 +74,39 @@ public: * \param size the number of transport channels */ send_packet_handler(const size_t size = 1): - _next_packet_seq(0), _cached_metadata(false) + _next_packet_seq(0), _cached_metadata(false) { this->set_enable_trailer(true); this->resize(size); } ~send_packet_handler(void){ - /* NOP */ + UHD_SAFE_CALL( + for (size_t i = 0; i < _worker_data.size(); i++) + { + _worker_data[i]->stop = true; + } + _worker_thread_group.join_all(); + ); } //! Resize the number of transport channels void resize(const size_t size){ if (this->size() == size) return; + + // Stop all worker threads + for (size_t i = 0; i < _worker_data.size(); i++) + { + _worker_data[i]->stop = true; + } + _worker_thread_group.join_all(); + _worker_threads.resize(size); + _worker_data.resize(size); + for (size_t i = 0; i < size; i++) + { + _worker_data[i] = boost::make_shared(); + } + _props.resize(size); static const uint64_t zero = 0; _zero_buffs.resize(size, &zero); @@ -145,7 +171,15 @@ public: * \param get_buff the getter function */ void set_xport_chan_get_buff(const size_t xport_chan, const get_buff_type &get_buff){ + if (_worker_threads[xport_chan]) + { + _worker_thread_group.remove_thread(_worker_threads[xport_chan]); + _worker_data[xport_chan]->stop = true; + _worker_threads[xport_chan]->join(); + _worker_data[xport_chan]->stop = false; + } _props.at(xport_chan).get_buff = get_buff; + _worker_threads[xport_chan] = _worker_thread_group.create_thread(boost::bind(&send_packet_handler::worker, this, xport_chan)); } //! Set the conversion routine for all channels @@ -381,63 +415,147 @@ private: if_packet_info.num_payload_words32 = (if_packet_info.num_payload_bytes + 3/*round up*/)/sizeof(uint32_t); if_packet_info.packet_count = _next_packet_seq; - //get a buffer for each channel or timeout - BOOST_FOREACH(xport_chan_props_type &props, _props){ - if (not props.buff) props.buff = props.get_buff(timeout); - if (not props.buff) return 0; //timeout + // wait for all worker threads to be ready or timeout + boost::system_time expiration = boost::get_system_time() + boost::posix_time::milliseconds(long(timeout * 1000)); + for (size_t i = 0; i < this->size(); i++) + { + while (not _worker_data[i]->ready) + { + if (boost::get_system_time() > expiration) + { + return 0; + } + } + _worker_data[i]->ready = false; } - //setup the data to share with converter threads + //setup the data to share with worker threads _convert_nsamps = nsamps_per_buff; _convert_buffs = &buffs; _convert_buffer_offset_bytes = buffer_offset_bytes; _convert_if_packet_info = &if_packet_info; - //perform N channels of conversion - for (size_t i = 0; i < this->size(); i++) { - convert_to_in_buff(i); + //start N channels of conversion + for (size_t i = 0; i < this->size(); i++) + { + _worker_data[i]->go = true; + } + + //make sure any sleeping worker threads are woken up + for (size_t i = 0; i < this->size(); i++) + { + // Acquiring the lock used by the condition variable + // takes too long, so do a spin wait. If the go flag + // is not cleared by this point, it will be cleared + // immediately by the worker thread when it wakes up. + while (_worker_data[i]->go) + { + _worker_data[i]->data_ready.notify_one(); + } + } + + //wait for all worker threads to be done + for (size_t i = 0; i < this->size(); i++) + { + //TODO: Implement a better wait strategy + //busy loop give fastest response, but these are just wasted cycles + while (not _worker_data[i]->done) {} + _worker_data[i]->done = false; } _next_packet_seq++; //increment sequence after commits return nsamps_per_buff; } - /*! Run the conversion from the internal buffers to the user's input - * buffer. + /*! Worker thread routine. * + * - Gets an internal data buffer * - Calls the converter * - Releases internal data buffers - * - Updates read/write pointers */ - UHD_INLINE void convert_to_in_buff(const size_t index) + void worker(const size_t index) { - //shortcut references to local data structures - managed_send_buffer::sptr &buff = _props[index].buff; - vrt::if_packet_info_t if_packet_info = *_convert_if_packet_info; - const tx_streamer::buffs_type &buffs = *_convert_buffs; - - //fill IO buffs with pointers into the output buffer - const void *io_buffs[4/*max interleave*/]; - for (size_t i = 0; i < _num_inputs; i++){ - const char *b = reinterpret_cast(buffs[index*_num_inputs + i]); - io_buffs[i] = b + _convert_buffer_offset_bytes; + //maximum number of cycles to spin before waiting on condition variable + //the value of 30000000 was derived from 15ms on a 10 GHz CPU divided by 5 cycles per loop + //the assumption is that anything held up for 15ms can wait + static const size_t MAX_SPIN_CYCLES = 30000000; + + //maximum amount of time to wait before checking the stop flag + static const double MAX_WAIT = 0.1; + + managed_send_buffer::sptr buff; + vrt::if_packet_info_t if_packet_info; + std::vector in_buffs(MAX_INTERLEAVE); + boost::shared_ptr worker_data = _worker_data[index]; + boost::unique_lock lock(worker_data->data_ready_lock); + size_t spins = 0; + + while (not worker_data->stop) + { + if (not buff) + { + buff = _props[index].get_buff(MAX_WAIT); + if (not buff) + { + continue; + } + worker_data->ready = true; + } + + //make sure done flag is cleared by controlling thread before waiting on go signal + if (worker_data->done) + { + continue; + } + + //partial spin lock before wait + while (not worker_data->go and spins < MAX_SPIN_CYCLES) + { + spins++; + } + if (not worker_data->go and + not worker_data->data_ready.timed_wait(lock, boost::posix_time::milliseconds(long(MAX_WAIT*1000)))) + { + continue; + } + // Clear the go flag immediately to let the + // controlling thread know we are not sleeping. + worker_data->go = false; + + //reset the spin count + spins = 0; + + //pack metadata into a vrt header + uint32_t *otw_mem = buff->cast() + _header_offset_words32; + if_packet_info = *_convert_if_packet_info; + if_packet_info.has_sid = _props[index].has_sid; + if_packet_info.sid = _props[index].sid; + _vrt_packer(otw_mem, if_packet_info); + otw_mem += if_packet_info.num_header_words32; + + //prepare the input buffers + for (size_t i = 0; i < _num_inputs; i++) + { + in_buffs[i] = + (reinterpret_cast((*_convert_buffs)[index*_num_inputs + i])) + + _convert_buffer_offset_bytes; + } + + //perform the conversion operation + _converter->conv(in_buffs, otw_mem, _convert_nsamps); + + //let the master know that new data can be prepared + _worker_data[index]->done = true; + + //commit the samples to the zero-copy interface + buff->commit( + (_header_offset_words32 + if_packet_info.num_packet_words32) + * sizeof(uint32_t) + ); + + //release the buffer + buff.reset(); } - const ref_vector in_buffs(io_buffs, _num_inputs); - - //pack metadata into a vrt header - uint32_t *otw_mem = buff->cast() + _header_offset_words32; - if_packet_info.has_sid = _props[index].has_sid; - if_packet_info.sid = _props[index].sid; - _vrt_packer(otw_mem, if_packet_info); - otw_mem += if_packet_info.num_header_words32; - - //perform the conversion operation - _converter->conv(in_buffs, otw_mem, _convert_nsamps); - - //commit the samples to the zero-copy interface - const size_t num_vita_words32 = _header_offset_words32+if_packet_info.num_packet_words32; - buff->commit(num_vita_words32*sizeof(uint32_t)); - buff.reset(); //effectively a release } //! Shared variables for the worker threads @@ -445,7 +563,18 @@ private: const tx_streamer::buffs_type *_convert_buffs; size_t _convert_buffer_offset_bytes; vrt::if_packet_info_t *_convert_if_packet_info; - + struct worker_thread_data_t { + worker_thread_data_t() : ready(false), go(false), done(false), stop(false) {} + boost::atomic_bool ready; + boost::atomic_bool go; + boost::atomic_bool done; + boost::atomic_bool stop; + boost::mutex data_ready_lock; + boost::condition_variable data_ready; + }; + std::vector< boost::shared_ptr > _worker_data; + boost::thread_group _worker_thread_group; + std::vector _worker_threads; }; class send_packet_streamer : public send_packet_handler, public tx_streamer{ -- cgit v1.2.3