From 9d3d404a46dd79ed2b7a6c39719c76839787127e Mon Sep 17 00:00:00 2001 From: "Matthias P. Braendli" Date: Wed, 16 Jan 2019 15:44:45 +0100 Subject: Add charset decoder for FIG1 labels --- src/charset.cpp | 78 ++++++++++++++++++++++++++++++++++++++++++++++++ src/charset.hpp | 35 ++++++++++++++++++++++ src/ensembledatabase.cpp | 62 ++++++++++++++++++++++++++++++++++---- src/ensembledatabase.hpp | 21 ++++++++----- src/etianalyse.cpp | 26 ++++------------ src/fig1.cpp | 59 +++++++++++++++++++++++++----------- src/fig2.cpp | 5 ++-- 7 files changed, 232 insertions(+), 54 deletions(-) create mode 100644 src/charset.cpp create mode 100644 src/charset.hpp (limited to 'src') diff --git a/src/charset.cpp b/src/charset.cpp new file mode 100644 index 0000000..5edb3df --- /dev/null +++ b/src/charset.cpp @@ -0,0 +1,78 @@ +/* + Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . +*/ +/*! + \file charset.cpp + \brief A converter for UTF-8 to EBU Latin charset according to + ETSI TS 101 756 Annex C, used for DLS and Labels. + + \author Matthias P. 
/*!
    \file charset.cpp
    \brief A converter from the EBU Latin charset (ETSI TS 101 756 Annex C),
           used for DLS and Labels, to UTF-8.
*/

/**********************************************/
/************* BIG FAT WARNING ****************/
/**********************************************/
/**** Make sure this file is always saved  ****/
/**** encoded in UTF-8, otherwise you will ****/
/**** mess up the table below !            ****/
/**********************************************/
/********* END OF BIG FAT WARNING *************/
/**********************************************/

namespace {

// NUL at index 0 cannot be represented, so the table starts at code 0x01.
constexpr unsigned CHARSET_TABLE_OFFSET = 1;
constexpr unsigned CHARSET_TABLE_ENTRIES = 256 - CHARSET_TABLE_OFFSET;

// UTF-8 encoding of every EBU Latin code point from 0x01 to 0xFF.
// Entry [code - CHARSET_TABLE_OFFSET] is the replacement text for `code`.
const char* const utf8_encoded_EBU_Latin[] = {
         "Ę", "Į", "Ų", "Ă", "Ė", "Ď", "Ș", "Ț", "Ċ", "\n","\v","Ġ", "Ĺ", "Ż", "Ń",
    "ą", "ę", "į", "ų", "ă", "ė", "ď", "ș", "ț", "ċ", "Ň", "Ě", "ġ", "ĺ", "ż", "\u0082",
    " ", "!", "\"","#", "ł", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?",
    "@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
    "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "Ů", "]", "Ł", "_",
    "Ą", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
    "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "«", "ů", "»", "Ľ", "Ħ",
    "á", "à", "é", "è", "í", "ì", "ó", "ò", "ú", "ù", "Ñ", "Ç", "Ş", "ß", "¡", "Ÿ",
    "â", "ä", "ê", "ë", "î", "ï", "ô", "ö", "û", "ü", "ñ", "ç", "ş", "ğ", "ı", "ÿ",
    "Ķ", "Ņ", "©", "Ģ", "Ğ", "ě", "ň", "ő", "Ő", "€", "£", "$", "Ā", "Ē", "Ī", "Ū",
    "ķ", "ņ", "Ļ", "ģ", "ļ", "İ", "ń", "ű", "Ű", "¿", "ľ", "°", "ā", "ē", "ī", "ū",
    "Á", "À", "É", "È", "Í", "Ì", "Ó", "Ò", "Ú", "Ù", "Ř", "Č", "Š", "Ž", "Ð", "Ŀ",
    "Â", "Ä", "Ê", "Ë", "Î", "Ï", "Ô", "Ö", "Û", "Ü", "ř", "č", "š", "ž", "đ", "ŀ",
    "Ã", "Å", "Æ", "Œ", "ŷ", "Ý", "Õ", "Ø", "Þ", "Ŋ", "Ŕ", "Ć", "Ś", "Ź", "Ť", "ð",
    "ã", "å", "æ", "œ", "ŵ", "ý", "õ", "ø", "þ", "ŋ", "ŕ", "ć", "ś", "ź", "ť", "ħ"};

// The array size is deduced from the initializer so that a dropped or
// duplicated entry fails the build instead of leaving a null pointer behind.
static_assert(sizeof(utf8_encoded_EBU_Latin) / sizeof(utf8_encoded_EBU_Latin[0])
                  == CHARSET_TABLE_ENTRIES,
              "EBU Latin table must cover every code from 0x01 to 0xFF");

} // namespace

/*! Convert an EBU Latin byte stream to a UTF-8 encoded string.
 *
 * \param str Bytes encoded in the EBU Latin charset; embedded NUL bytes
 *            are accepted.
 * \return The UTF-8 text. Every input byte yields one table entry; a NUL
 *         byte, which has no table entry, is converted to ⁇ (U+2047).
 */
std::string convert_ebu_to_utf8(const std::string& str)
{
    std::string utf8_str;
    // Each input byte produces at least one output byte.
    utf8_str.reserve(str.size());

    for (const char ch : str) {
        // Go through unsigned char so bytes >= 0x80 index the upper half of
        // the table instead of becoming negative.
        const auto code = static_cast<unsigned char>(ch);

        // Table offset because NUL is not represented
        if (code >= CHARSET_TABLE_OFFSET) {
            utf8_str += utf8_encoded_EBU_Latin[code - CHARSET_TABLE_OFFSET];
        }
        else {
            utf8_str += "⁇";
        }
    }

    return utf8_str;
}
+ */ +std::string convert_ebu_to_utf8(const std::string& str); diff --git a/src/ensembledatabase.cpp b/src/ensembledatabase.cpp index 1353c24..e5066a6 100644 --- a/src/ensembledatabase.cpp +++ b/src/ensembledatabase.cpp @@ -31,6 +31,7 @@ #include #include #include "ensembledatabase.hpp" +#include "charset.hpp" namespace ensemble_database { @@ -50,6 +51,37 @@ static string ucs2toutf8(const uint8_t *ucs2, size_t len_bytes) return ucsconv.to_bytes(ucs2label); } +std::string label_t::label() const +{ + switch (charset) { + case charset_e::COMPLETE_EBU_LATIN: + return convert_ebu_to_utf8(string(label_bytes.begin(), label_bytes.end())); + case charset_e::UTF8: + return string(label_bytes.begin(), label_bytes.end()); + case charset_e::UCS2: + try { + return ucs2toutf8(label_bytes.data(), label_bytes.size()); + } + catch (const range_error&) { + return ""; + } + case charset_e::UNDEFINED: + throw logic_error("charset undefined"); + } + throw logic_error("invalid charset " + to_string((int)charset)); +} + +std::string label_t::shortlabel() const +{ + string shortlabel; + for (size_t i = 0; i < label_bytes.size(); ++i) { + if (shortlabel_flag & 0x8000 >> i) { + shortlabel += static_cast(label_bytes[i]); + } + } + + return shortlabel; +} string label_t::assemble() const { @@ -64,18 +96,23 @@ string label_t::assemble() const } } - switch (charset) { - case extended_label_charset::UTF8: + switch (extended_label_charset) { + case charset_e::COMPLETE_EBU_LATIN: + // FIG2 doesn't allow EBU, use FIG1 for those + return ""; + case charset_e::UTF8: return string(segments_cat.begin(), segments_cat.end()); - case extended_label_charset::UCS2: + case charset_e::UCS2: try { return ucs2toutf8(segments_cat.data(), segments_cat.size()); } catch (const range_error&) { return ""; } + case charset_e::UNDEFINED: + return ""; } - throw logic_error("invalid charset"); + throw logic_error("invalid extended label charset " + to_string((int)extended_label_charset)); } string 
/* Character set of a label.
 *
 * The values of COMPLETE_EBU_LATIN, UCS2 and UTF8 are the numbers that the
 * FIG 1 decoder compares against the 4-bit charset field it reads from the
 * FIG header. For FIG 2 extended labels the charset is chosen from the
 * encoding flag instead (see the per-enumerator notes).
 *
 * UNDEFINED marks a label whose charset has not been set yet.
 */
enum class charset_e {
    COMPLETE_EBU_LATIN = 0,
    UCS2 = 6,       // FIG 2: encoding flag = 1
    UNDEFINED = 7,
    UTF8 = 15,      // FIG 2: encoding flag = 0
};
Returns an @@ -92,8 +99,6 @@ struct component_t { bool primary; - label_t label; - /* TODO uint8_t type; diff --git a/src/etianalyse.cpp b/src/etianalyse.cpp index b419593..69f4cf7 100644 --- a/src/etianalyse.cpp +++ b/src/etianalyse.cpp @@ -91,19 +91,6 @@ static void print_fig_result(const fig_result_t& fig_result, const display_setti } } -static std::string flag_to_shortlabel(const ensemble_database::label_t label) -{ - stringstream shortlabel; - for (size_t i = 0; i < label.label.size(); ++i) { - if (label.shortlabel_flag & 0x8000 >> i) { - shortlabel << label.label[i]; - } - } - - return shortlabel.str(); -} - - void ETI_Analyser::analyse() { if (config.etifd != nullptr) { @@ -578,8 +565,8 @@ void ETI_Analyser::eti_analyse() fprintf(stat_fd, "---\n"); fprintf(stat_fd, "ensemble:\n"); fprintf(stat_fd, " id: 0x%x\n", ensemble.EId); - fprintf(stat_fd, " label: %s\n", ensemble.label.label.c_str()); - fprintf(stat_fd, " shortlabel: %s\n", flag_to_shortlabel(ensemble.label).c_str()); + fprintf(stat_fd, " label: %s\n", ensemble.label.label().c_str()); + fprintf(stat_fd, " shortlabel: %s\n", ensemble.label.shortlabel().c_str()); fprintf(stat_fd, "audio:\n"); for (const auto& snoop : config.streams_to_decode) { @@ -592,12 +579,9 @@ void ETI_Analyser::eti_analyse() corresponding_service_found = true; fprintf(stat_fd, " - service_id: 0x%x\n", service.id); fprintf(stat_fd, " subchannel_id: 0x%x\n", component.subchId); - fprintf(stat_fd, " label: %s\n", service.label.label.c_str()); - fprintf(stat_fd, " shortlabel: %s\n", flag_to_shortlabel(service.label).c_str()); - if (not component.label.label.empty()) { - fprintf(stat_fd, " component_label: %s\n", component.label.label.c_str()); - } - // TODO FIG2 labels + fprintf(stat_fd, " label: %s\n", service.label.label().c_str()); + fprintf(stat_fd, " shortlabel: %s\n", service.label.shortlabel().c_str()); + fprintf(stat_fd, " extended_label: %s\n", service.label.assemble().c_str()); try { const auto& subch = 
ensemble.get_subchannel(component.subchId); diff --git a/src/fig1.cpp b/src/fig1.cpp index 665544c..bcabf94 100644 --- a/src/fig1.cpp +++ b/src/fig1.cpp @@ -31,23 +31,37 @@ using namespace std; +static ensemble_database::charset_e charset_to_charset(uint8_t charset) +{ + using ensemble_database::charset_e; + if (charset == (uint8_t)charset_e::COMPLETE_EBU_LATIN) { + return charset_e::COMPLETE_EBU_LATIN; + } + else if (charset == (uint8_t)charset_e::UTF8) { + return charset_e::UTF8; + } + else if (charset == (uint8_t)charset_e::UCS2) { + return charset_e::UCS2; + } + else { + throw runtime_error("unsupported charset" + to_string(charset)); + } +} + // SHORT LABELS fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp) { - uint16_t ext,charset; - uint16_t flag; - char label[17]; + vector label(16); fig_result_t r; uint8_t* f = fig1.f; - charset = (f[0] & 0xF0) >> 4; + uint8_t charset = (f[0] & 0xF0) >> 4; //oe = (f[0] & 0x08) >> 3; - ext = f[0] & 0x07; + uint16_t ext = f[0] & 0x07; r.msgs.push_back(strprintf("Charset=%d", charset)); - memcpy(label, f+fig1.figlen-18, 16); - label[16] = 0x00; - flag = f[fig1.figlen-2] * 256 + \ + memcpy(label.data(), f+fig1.figlen-18, 16); + uint16_t flag = f[fig1.figlen-2] * 256 + \ f[fig1.figlen-1]; switch (ext) { @@ -56,13 +70,16 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp) uint16_t eid; eid = f[1] * 256 + f[2]; r.msgs.push_back(strprintf("Ensemble ID=0x%04X", eid)); - r.msgs.push_back(strprintf("Label=\"%s\"", label)); - r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); if (fig1.fibcrccorrect) { fig1.ensemble.EId = eid; - fig1.ensemble.label.label = label; + fig1.ensemble.label.label_bytes = label; fig1.ensemble.label.shortlabel_flag = flag; + fig1.ensemble.label.charset = charset_to_charset(charset); + + r.msgs.push_back(strprintf("Label=\"%s\"", fig1.ensemble.label.label().c_str())); + r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); + 
r.msgs.push_back(strprintf("Short label=%s", fig1.ensemble.label.shortlabel().c_str())); } } break; @@ -71,15 +88,18 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp) { // ETSI EN 300 401 8.1.14.1 uint16_t sid; sid = f[1] * 256 + f[2]; - r.msgs.push_back(strprintf("Service ID=0x%04X", sid)); - r.msgs.push_back(strprintf("Label=\"%s\"", label)); - r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); if (fig1.fibcrccorrect) { try { auto& service = fig1.ensemble.get_service(sid); - service.label.label = label; + service.label.label_bytes = label; service.label.shortlabel_flag = flag; + service.label.charset = charset_to_charset(charset); + + r.msgs.push_back(strprintf("Service ID=0x%04X", sid)); + r.msgs.push_back(strprintf("Label=\"%s\"", service.label.label().c_str())); + r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); + r.msgs.push_back(strprintf("Short label=%s", service.label.shortlabel().c_str())); } catch (ensemble_database::not_found &e) { r.errors.push_back("Not yet in DB"); @@ -106,7 +126,8 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp) } r.msgs.push_back(strprintf("Service ID=0x%04X", sid)); r.msgs.push_back(strprintf("Service Component ID=0x%04X", SCIdS)); - r.msgs.push_back(strprintf("Label=\"%s\"", label)); + // TODO put label into ensembledatabase + r.msgs.push_back(strprintf("Label bytes=\"%s\"", string(label.begin(), label.end()).c_str())); r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); } break; @@ -120,7 +141,8 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp) f[4]; r.msgs.push_back(strprintf("Service ID=0x%04X", sid)); - r.msgs.push_back(strprintf("Label=\"%s\"", label)); + // TODO put label into ensembledatabase + r.msgs.push_back(strprintf("Label bytes=\"%s\"", string(label.begin(), label.end()).c_str())); r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); } break; @@ -161,7 +183,8 @@ fig_result_t 
fig1_select(fig1_common_t& fig1, const display_settings_t &disp) r.msgs.push_back(strprintf("Service ID=0x%04X", sid)); r.msgs.push_back(strprintf("Service Component ID=0x%04X", SCIdS)); r.msgs.push_back(strprintf("X-PAD App=%02X (", xpadapp) + xpadappdesc + ")"); - r.msgs.push_back(strprintf("Label=\"%s\"", label)); + // TODO put label into ensembledatabase + r.msgs.push_back(strprintf("Label bytes=\"%s\"", string(label.begin(), label.end()).c_str())); r.msgs.push_back(strprintf("Short label mask=0x%04X", flag)); } break; diff --git a/src/fig2.cpp b/src/fig2.cpp index 5c1227a..74243ce 100644 --- a/src/fig2.cpp +++ b/src/fig2.cpp @@ -40,6 +40,7 @@ static void handle_ext_label_data_field(fig2_common_t& fig2, ensemble_database:: if (label.toggle_flag != fig2.toggle_flag()) { label.segments.clear(); + label.extended_label_charset = ensemble_database::charset_e::UNDEFINED; label.toggle_flag = fig2.toggle_flag(); } @@ -55,10 +56,10 @@ static void handle_ext_label_data_field(fig2_common_t& fig2, ensemble_database:: r.msgs.push_back(strprintf("Total number of segments=%d", segment_count + 1)); if (encoding_flag) { - label.charset = ensemble_database::extended_label_charset::UCS2; + label.extended_label_charset = ensemble_database::charset_e::UCS2; } else { - label.charset = ensemble_database::extended_label_charset::UTF8; + label.extended_label_charset = ensemble_database::charset_e::UTF8; } if (fig2.rfu() == 0) { -- cgit v1.2.3