aboutsummaryrefslogtreecommitdiffstats
path: root/src/CharsetTools.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/CharsetTools.cpp')
-rw-r--r--src/CharsetTools.cpp143
1 files changed, 143 insertions, 0 deletions
diff --git a/src/CharsetTools.cpp b/src/CharsetTools.cpp
new file mode 100644
index 0000000..d35c121
--- /dev/null
+++ b/src/CharsetTools.cpp
@@ -0,0 +1,143 @@
+/*
+ Copyright (C) 2005, 2006, 2007, 2008, 2009, 2010, 2011 Her Majesty
+ the Queen in Right of Canada (Communications Research Center Canada)
+
+ Most parts of this file are taken from dablin,
+ Copyright (C) 2015-2022 Stefan Pöschel
+
+ Copyright (C) 2023
+ Matthias P. Braendli, matthias.braendli@mpb.li
+
+ http://opendigitalradio.org
+ */
+/*
+ This file is part of ODR-DabMod.
+
+ ODR-DabMod is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as
+ published by the Free Software Foundation, either version 3 of the
+ License, or (at your option) any later version.
+
+ ODR-DabMod is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with ODR-DabMod. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <vector>
+#include <algorithm>
+#include <stdexcept>
+#include <string>
+#include <ctime>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include "CharsetTools.h"
+
+// --- CharsetTools -----------------------------------------------------------------
+const char* CharsetTools::no_char = ""; // empty string: table entry for codes that produce no output
+const char* CharsetTools::ebu_values_0x00_to_0x1F[] = { // replacement strings for codes 0x00..0x1F, indexed directly by the code (see ConvertCharEBUToUTF8)
+ no_char , "\u0118", "\u012E", "\u0172", "\u0102", "\u0116", "\u010E", "\u0218", "\u021A", "\u010A", no_char , no_char , "\u0120", "\u0139" , "\u017B", "\u0143", // 0x00..0x0F
+ "\u0105", "\u0119", "\u012F", "\u0173", "\u0103", "\u0117", "\u010F", "\u0219", "\u021B", "\u010B", "\u0147", "\u011A", "\u0121", "\u013A", "\u017C", no_char // 0x10..0x1F
+};
+const char* CharsetTools::ebu_values_0x7B_to_0xFF[] = { // replacement strings for codes 0x7B..0xFF, indexed by (code - 0x7B) (see ConvertCharEBUToUTF8)
+ /* starting some chars earlier than 0x80 -----> */ "\u00AB", "\u016F", "\u00BB", "\u013D", "\u0126", // 0x7B..0x7F
+ "\u00E1", "\u00E0", "\u00E9", "\u00E8", "\u00ED", "\u00EC", "\u00F3", "\u00F2", "\u00FA", "\u00F9", "\u00D1", "\u00C7", "\u015E", "\u00DF", "\u00A1", "\u0178", // 0x80..0x8F
+ "\u00E2", "\u00E4", "\u00EA", "\u00EB", "\u00EE", "\u00EF", "\u00F4", "\u00F6", "\u00FB", "\u00FC", "\u00F1", "\u00E7", "\u015F", "\u011F", "\u0131", "\u00FF", // 0x90..0x9F
+ "\u0136", "\u0145", "\u00A9", "\u0122", "\u011E", "\u011B", "\u0148", "\u0151", "\u0150", "\u20AC", "\u00A3", "\u0024", "\u0100", "\u0112", "\u012A", "\u016A", // 0xA0..0xAF
+ "\u0137", "\u0146", "\u013B", "\u0123", "\u013C", "\u0130", "\u0144", "\u0171", "\u0170", "\u00BF", "\u013E", "\u00B0", "\u0101", "\u0113", "\u012B", "\u016B", // 0xB0..0xBF
+ "\u00C1", "\u00C0", "\u00C9", "\u00C8", "\u00CD", "\u00CC", "\u00D3", "\u00D2", "\u00DA", "\u00D9", "\u0158", "\u010C", "\u0160", "\u017D", "\u00D0", "\u013F", // 0xC0..0xCF
+ "\u00C2", "\u00C4", "\u00CA", "\u00CB", "\u00CE", "\u00CF", "\u00D4", "\u00D6", "\u00DB", "\u00DC", "\u0159", "\u010D", "\u0161", "\u017E", "\u0111", "\u0140", // 0xD0..0xDF
+ "\u00C3", "\u00C5", "\u00C6", "\u0152", "\u0177", "\u00DD", "\u00D5", "\u00D8", "\u00DE", "\u014A", "\u0154", "\u0106", "\u015A", "\u0179", "\u0164", "\u00F0", // 0xE0..0xEF
+ "\u00E3", "\u00E5", "\u00E6", "\u0153", "\u0175", "\u00FD", "\u00F5", "\u00F8", "\u00FE", "\u014B", "\u0155", "\u0107", "\u015B", "\u017A", "\u0165", "\u0127" // 0xF0..0xFF
+};
+
+// Maps one character code of the EBU Latin based charset to its replacement string.
+//
+// Codes 0x00..0x1F and 0x7B..0xFF are resolved through the two lookup tables,
+// four codes in between are remapped explicitly, and every other code is
+// returned unchanged as a one-byte string.
+std::string CharsetTools::ConvertCharEBUToUTF8(const uint8_t value) {
+    // both ends of the code range are table driven
+    if(value < 0x20)
+        return ebu_values_0x00_to_0x1F[value];
+    if(value > 0x7A)
+        return ebu_values_0x7B_to_0xFF[value - 0x7B];
+
+    // the middle range maps 1:1 except for these four codes,
+    // which is why no table is used here
+    if(value == 0x24)
+        return "\u0142";
+    if(value == 0x5C)
+        return "\u016E";
+    if(value == 0x5E)
+        return "\u0141";
+    if(value == 0x60)
+        return "\u0104";
+
+    // identical code in both charsets: pass the byte through
+    return std::string(1, static_cast<char>(value));
+}
+
+
+// Converts a raw text buffer in the given charset to a UTF-8 std::string.
+//
+// data, len:    input bytes; data is only dereferenced for indices < len
+// charset:      0b0000 selects EBU Latin based, 0b1111 selects UTF-8
+// charset_name: optional output (may be nullptr); receives a readable name of
+//               the charset when it is supported, and is left untouched otherwise
+//
+// The bytes 0x00, 0x0A, 0x0B and 0x1F are dropped before conversion.
+// Returns the converted text, or an empty string for an unsupported charset.
+std::string CharsetTools::ConvertTextToUTF8(const uint8_t *data, size_t len, int charset, std::string* charset_name) {
+    // remove undesired chars
+    std::vector<uint8_t> cleaned_data;
+    cleaned_data.reserve(len);
+    for(size_t i = 0; i < len; i++) {
+        switch(data[i]) {
+        case 0x00:  // NULL
+        case 0x0A:  // PLB
+        case 0x0B:  // EoH
+        case 0x1F:  // PWB
+            continue;
+        default:
+            cleaned_data.push_back(data[i]);
+        }
+    }
+
+    // convert characters
+    if(charset == 0b0000) {  // EBU Latin based
+        if(charset_name)
+            *charset_name = "EBU Latin based";
+
+        std::string result;
+        for(const uint8_t c : cleaned_data)
+            result += ConvertCharEBUToUTF8(c);
+        return result;
+    }
+
+    if(charset == 0b1111) {  // UTF-8
+        if(charset_name)
+            *charset_name = "UTF-8";
+
+        // Build from the iterator range instead of &cleaned_data[0]: indexing
+        // element 0 of an empty vector (empty input, or input consisting only
+        // of stripped control codes) is undefined behaviour.
+        return std::string(cleaned_data.begin(), cleaned_data.end());
+    }
+
+    // ignore unsupported charset
+    return "";
+}
+
+
+// Returns the number of bytes occupied by the first `chars` UTF-8 characters
+// of s, i.e. the byte offset at which the character with zero-based index
+// `chars` starts. If s holds fewer characters, s.size() is returned.
+// Continuation bytes (10xxxxxx) never start a character; they are counted
+// as part of whatever precedes them.
+size_t StringTools::UTF8CharsLen(const std::string &s, size_t chars) {
+    size_t offset = 0;
+    while(offset < s.size()) {
+        const bool starts_char = (s[offset] & 0xC0) != 0x80;
+        if(starts_char) {
+            // the requested number of characters lies completely before this byte
+            if(chars == 0)
+                return offset;
+            --chars;
+        }
+        ++offset;
+    }
+    return offset;
+}
+
+// Returns the number of UTF-8 characters in s, obtained by counting every
+// byte that is not a continuation byte (10xxxxxx).
+size_t StringTools::UTF8Len(const std::string &s) {
+    size_t char_count = 0;
+    for(const char c : s) {
+        if((c & 0xC0) != 0x80)
+            ++char_count;
+    }
+    return char_count;
+}
+
+// Returns up to `count` UTF-8 characters of s, starting at the character with
+// zero-based index `pos`. Both pos and count are given in characters, not
+// bytes. The result is empty if s has no more than `pos` characters.
+std::string StringTools::UTF8Substr(const std::string &s, size_t pos, size_t count) {
+    // drop the first `pos` characters ...
+    const size_t first_byte = UTF8CharsLen(s, pos);
+    std::string tail = s.substr(first_byte);
+
+    // ... and keep at most `count` characters of what is left
+    tail.resize(UTF8CharsLen(tail, count));
+    return tail;
+}