From f4ef1284d754ecd907dd4ed3072f8be65b26f2de Mon Sep 17 00:00:00 2001 From: "Matthias P. Braendli" Date: Fri, 9 Feb 2018 11:42:51 +0100 Subject: Add charset conversion library --- lib/charset/charset.h | 89 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 89 insertions(+) create mode 100644 lib/charset/charset.h (limited to 'lib/charset/charset.h') diff --git a/lib/charset/charset.h b/lib/charset/charset.h new file mode 100644 index 0000000..6ff19bc --- /dev/null +++ b/lib/charset/charset.h @@ -0,0 +1,89 @@ +/* + Copyright (C) 2015 Matthias P. Braendli (http://opendigitalradio.org) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/*! + \file charset.h + \brief Define the EBU charset according to ETSI TS 101 756v1.8.1 for DLS encoding + + \author Matthias P. Braendli + \author Lindsay Cornell +*/ + +#ifndef __CHARSET_H_ +#define __CHARSET_H_ + +#include "utf8.h" +#include +#include +#include +#include + +#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented +#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET) +extern const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES]; + +class CharsetConverter +{ + public: + CharsetConverter() { + /*! Build the converstion table that contains the known code points, + * at the indices corresponding to the EBU Latin table + */ + using namespace std; + for (size_t i = 0; i < CHARSET_TABLE_ENTRIES; i++) { + string table_entry(utf8_encoded_EBU_Latin[i]); + string::iterator it = table_entry.begin(); + uint32_t code_point = utf8::next(it, table_entry.end()); + m_conversion_table.push_back(code_point); + } + } + + /*! Convert a UTF-8 encoded text line into an EBU Latin encoded byte stream + */ + std::string convert(std::string line_utf8) { + using namespace std; + + // check for invalid utf-8, we only convert up to the first error + string::iterator end_it = utf8::find_invalid(line_utf8.begin(), line_utf8.end()); + + // Convert it to utf-32 + vector utf32line; + utf8::utf8to32(line_utf8.begin(), end_it, back_inserter(utf32line)); + + string encoded_line(utf32line.size(), '0'); + + // Try to convert each codepoint + for (size_t i = 0; i < utf32line.size(); i++) { + vector::iterator iter = find(m_conversion_table.begin(), + m_conversion_table.end(), utf32line[i]); + if (iter != m_conversion_table.end()) { + size_t index = std::distance(m_conversion_table.begin(), iter); + + encoded_line[i] = (char)(index + CHARSET_TABLE_OFFSET); + } + else { + encoded_line[i] = ' '; + } + } + return encoded_line; + } + + private: + + std::vector m_conversion_table; +}; + +#endif -- cgit v1.2.3