diff options
Diffstat (limited to 'src/CharsetTools.cpp')
-rw-r--r-- | src/CharsetTools.cpp | 143 |
1 files changed, 143 insertions, 0 deletions
diff --git a/src/CharsetTools.cpp b/src/CharsetTools.cpp new file mode 100644 index 0000000..d35c121 --- /dev/null +++ b/src/CharsetTools.cpp @@ -0,0 +1,143 @@ +/* + Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Her Majesty + the Queen in Right of Canada (Communications Research Center Canada) + + Most parts of this file are taken from dablin, + Copyright (C) 2015-2022 Stefan Pöschel + + Copyright (C) 2023 + Matthias P. Braendli, matthias.braendli@mpb.li + + http://opendigitalradio.org + */ +/* + This file is part of ODR-DabMod. + + ODR-DabMod is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as + published by the Free Software Foundation, either version 3 of the + License, or (at your option) any later version. + + ODR-DabMod is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with ODR-DabMod. If not, see <http://www.gnu.org/licenses/>. + */ + +#include <vector> +#include <algorithm> +#include <stdexcept> +#include <string> +#include <ctime> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include "CharsetTools.h" + +// --- CharsetTools ----------------------------------------------------------------- +const char* CharsetTools::no_char = ""; +const char* CharsetTools::ebu_values_0x00_to_0x1F[] = { + no_char , "\u0118", "\u012E", "\u0172", "\u0102", "\u0116", "\u010E", "\u0218", "\u021A", "\u010A", no_char , no_char , "\u0120", "\u0139" , "\u017B", "\u0143", + "\u0105", "\u0119", "\u012F", "\u0173", "\u0103", "\u0117", "\u010F", "\u0219", "\u021B", "\u010B", "\u0147", "\u011A", "\u0121", "\u013A", "\u017C", no_char +}; +const char* CharsetTools::ebu_values_0x7B_to_0xFF[] = { + /* starting some chars earlier than 0x80 -----> */ "\u00AB", "\u016F", "\u00BB", "\u013D", "\u0126", + "\u00E1", "\u00E0", "\u00E9", "\u00E8", "\u00ED", "\u00EC", "\u00F3", "\u00F2", "\u00FA", "\u00F9", "\u00D1", "\u00C7", "\u015E", "\u00DF", "\u00A1", "\u0178", + "\u00E2", "\u00E4", "\u00EA", "\u00EB", "\u00EE", "\u00EF", "\u00F4", "\u00F6", "\u00FB", "\u00FC", "\u00F1", "\u00E7", "\u015F", "\u011F", "\u0131", "\u00FF", + "\u0136", "\u0145", "\u00A9", "\u0122", "\u011E", "\u011B", "\u0148", "\u0151", "\u0150", "\u20AC", "\u00A3", "\u0024", "\u0100", "\u0112", "\u012A", "\u016A", + "\u0137", "\u0146", "\u013B", "\u0123", "\u013C", "\u0130", "\u0144", "\u0171", "\u0170", "\u00BF", "\u013E", "\u00B0", "\u0101", "\u0113", "\u012B", "\u016B", + "\u00C1", "\u00C0", "\u00C9", "\u00C8", "\u00CD", "\u00CC", "\u00D3", "\u00D2", "\u00DA", "\u00D9", "\u0158", "\u010C", "\u0160", "\u017D", "\u00D0", "\u013F", + "\u00C2", "\u00C4", "\u00CA", "\u00CB", "\u00CE", "\u00CF", "\u00D4", "\u00D6", "\u00DB", "\u00DC", "\u0159", "\u010D", "\u0161", "\u017E", "\u0111", "\u0140", + "\u00C3", "\u00C5", "\u00C6", "\u0152", "\u0177", "\u00DD", "\u00D5", "\u00D8", "\u00DE", "\u014A", "\u0154", "\u0106", "\u015A", "\u0179", "\u0164", "\u00F0", + "\u00E3", "\u00E5", "\u00E6", "\u0153", "\u0175", "\u00FD", "\u00F5", "\u00F8", "\u00FE", "\u014B", "\u0155", "\u0107", "\u015B", "\u017A", "\u0165", "\u0127" +}; + +std::string CharsetTools::ConvertCharEBUToUTF8(const uint8_t value) { + // convert via LUT + if(value <= 0x1F) + return ebu_values_0x00_to_0x1F[value]; + if(value >= 0x7B) + return ebu_values_0x7B_to_0xFF[value - 0x7B]; + + // convert by hand (avoiding a LUT with mostly 1:1 mapping) + switch(value) { + case 0x24: + return "\u0142"; + case 0x5C: + return "\u016E"; + case 0x5E: + return "\u0141"; + case 0x60: + return "\u0104"; + } + + // leave untouched + return std::string((char*) &value, 1); +} + + +std::string CharsetTools::ConvertTextToUTF8(const uint8_t *data, size_t len, int charset, std::string* charset_name) { + // remove undesired chars + std::vector<uint8_t> cleaned_data; + for(size_t i = 0; i < len; i++) { + switch(data[i]) { + case 0x00: // NULL + case 0x0A: // PLB + case 0x0B: // EoH + case 0x1F: // PWB + continue; + default: + cleaned_data.push_back(data[i]); + } + } + + // convert characters + if(charset == 0b0000) { // EBU Latin based + if(charset_name) + *charset_name = "EBU Latin based"; + + std::string result; + for(const uint8_t& c : cleaned_data) + result += ConvertCharEBUToUTF8(c); + return result; + } + + if(charset == 0b1111) { // UTF-8 + if(charset_name) + *charset_name = "UTF-8"; + + return std::string((char*) &cleaned_data[0], cleaned_data.size()); + } + + // ignore unsupported charset + return ""; +} + + +size_t StringTools::UTF8CharsLen(const std::string &s, size_t chars) { + size_t result; + for(result = 0; result < s.size(); result++) { + // if not a continuation byte, handle counter + if((s[result] & 0xC0) != 0x80) { + if(chars == 0) + break; + chars--; + } + } + return result; +} + +size_t StringTools::UTF8Len(const std::string &s) { + // ignore continuation bytes + return std::count_if(s.cbegin(), s.cend(), [](const char c){return (c & 0xC0) != 0x80;}); +} + +std::string StringTools::UTF8Substr(const std::string &s, size_t pos, size_t count) { + std::string result = s; + result.erase(0, UTF8CharsLen(result, pos)); + result.erase(UTF8CharsLen(result, count)); + return result; +} |