Add charset decoder for FIG1 labels

author: Matthias P. Braendli <matthias.braendli@mpb.li> 2019-01-16 15:44:45 +0100
committer: Matthias P. Braendli <matthias.braendli@mpb.li> 2019-01-16 15:44:45 +0100
commit: 9d3d404a46dd79ed2b7a6c39719c76839787127e (patch)
tree: 9f67996d2f72b0f93302d695299b173e50698244
parent: 1df947bae7346948e08edb75616d34fcf8802dae (diff)
download: etisnoop-9d3d404a46dd79ed2b7a6c39719c76839787127e.tar.gz
etisnoop-9d3d404a46dd79ed2b7a6c39719c76839787127e.tar.bz2
etisnoop-9d3d404a46dd79ed2b7a6c39719c76839787127e.zip
8 files changed, 233 insertions, 54 deletions
diff --git a/Makefile.am b/Makefile.am
index 311f123..2f0f9c8 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -14,6 +14,7 @@ etisnoop_SOURCES     = src/dabplussnoop.cpp src/dabplussnoop.hpp \
 					   src/etiinput.cpp src/etiinput.hpp \
 					   src/etianalyse.cpp src/etianalyse.hpp \
 					   src/etisnoop.cpp \
+					   src/charset.cpp src/charset.hpp \
 					   src/faad_decoder.cpp src/faad_decoder.hpp \
 					   src/ensembledatabase.hpp src/ensembledatabase.cpp \
 					   src/fig0_0.cpp \
diff --git a/src/charset.cpp b/src/charset.cpp
new file mode 100644
index 0000000..5edb3df
--- /dev/null
+++ b/src/charset.cpp
@@ -0,0 +1,78 @@
+/*
+    Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/*!
+    \file charset.cpp
+    \brief A converter from the EBU Latin charset to UTF-8 according to
+           ETSI TS 101 756 Annex C, used for DLS and Labels.
+
+    \author Matthias P. Braendli
+    \author Lindsay Cornell
+*/
+
+#include "charset.hpp"
+#include <algorithm>
+
+/**********************************************/
+/************* BIG FAT WARNING ****************/
+/**********************************************/
+/**** Make sure this file is always saved  ****/
+/**** encoded in UTF-8, otherwise you will ****/
+/****      mess up the table below !       ****/
+/**********************************************/
+/********* END OF BIG FAT WARNING *************/
+/**********************************************/
+
+#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented
+#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET)
+
+static const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
+     "Ę", "Į", "Ų", "Ă", "Ė", "Ď", "Ș", "Ț", "Ċ", "\n","\v","Ġ", "Ĺ", "Ż", "Ń",
+"ą", "ę", "į", "ų", "ă", "ė", "ď", "ș", "ț", "ċ", "Ň", "Ě", "ġ", "ĺ", "ż", "\u0082",
+" ", "!", "\"","#", "ł", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
+"0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ":", ";", "<", "=", ">", "?",
+"@", "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O",
+"P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z", "[", "Ů", "]", "Ł", "_",
+"Ą", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o",
+"p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "«", "ů", "»", "Ľ", "Ħ",
+"á", "à", "é", "è", "í", "ì", "ó", "ò", "ú", "ù", "Ñ", "Ç", "Ş", "ß", "¡", "Ÿ",
+"â", "ä", "ê", "ë", "î", "ï", "ô", "ö", "û", "ü", "ñ", "ç", "ş", "ğ", "ı", "ÿ",
+"Ķ", "Ņ", "©", "Ģ", "Ğ", "ě", "ň", "ő", "Ő", "€", "£", "$", "Ā", "Ē", "Ī", "Ū",
+"ķ", "ņ", "Ļ", "ģ", "ļ", "İ", "ń", "ű", "Ű", "¿", "ľ", "°", "ā", "ē", "ī", "ū",
+"Á", "À", "É", "È", "Í", "Ì", "Ó", "Ò", "Ú", "Ù", "Ř", "Č", "Š", "Ž", "Ð", "Ŀ",
+"Â", "Ä", "Ê", "Ë", "Î", "Ï", "Ô", "Ö", "Û", "Ü", "ř", "č", "š", "ž", "đ", "ŀ",
+"Ã", "Å", "Æ", "Œ", "ŷ", "Ý", "Õ", "Ø", "Þ", "Ŋ", "Ŕ", "Ć", "Ś", "Ź", "Ť", "ð",
+"ã", "å", "æ", "œ", "ŵ", "ý", "õ", "ø", "þ", "ŋ", "ŕ", "ć", "ś", "ź", "ť", "ħ"};
+
+using namespace std;
+
+std::string convert_ebu_to_utf8(const std::string& str)
+{
+    string utf8_str;
+    for (const uint8_t c : str) {
+        // Table offset because NUL is not represented
+        if (c >= CHARSET_TABLE_OFFSET) {
+            string utf8_char(utf8_encoded_EBU_Latin[c - CHARSET_TABLE_OFFSET]);
+            utf8_str += utf8_char;
+        }
+        else {
+            utf8_str += "⁇";
+        }
+    }
+
+    return utf8_str;
+}
+
diff --git a/src/charset.hpp b/src/charset.hpp
new file mode 100644
index 0000000..c87a35f
--- /dev/null
+++ b/src/charset.hpp
@@ -0,0 +1,35 @@
+/*
+    Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org)
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+*/
+/*!
+    \file charset.hpp
+    \brief A converter from the EBU Latin charset to UTF-8 according to
+           ETSI TS 101 756 Annex C, used for DLS and Labels.
+
+    \author Matthias P. Braendli
+    \author Lindsay Cornell
+*/
+
+#pragma once
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+/*! Convert an EBU Latin byte stream to a UTF-8 encoded string.
+ *  Invalid input characters are converted to ⁇ (Unicode U+2047).
+ */
+std::string convert_ebu_to_utf8(const std::string& str);
diff --git a/src/ensembledatabase.cpp b/src/ensembledatabase.cpp
index 1353c24..e5066a6 100644
--- a/src/ensembledatabase.cpp
+++ b/src/ensembledatabase.cpp
@@ -31,6 +31,7 @@
 #include <codecvt>
 #include <sstream>
 #include "ensembledatabase.hpp"
+#include "charset.hpp"
 
 namespace ensemble_database {
 
@@ -50,6 +51,37 @@ static string ucs2toutf8(const uint8_t *ucs2, size_t len_bytes)
     return ucsconv.to_bytes(ucs2label);
 }
 
+std::string label_t::label() const
+{
+    switch (charset) {
+        case charset_e::COMPLETE_EBU_LATIN:
+            return convert_ebu_to_utf8(string(label_bytes.begin(), label_bytes.end()));
+        case charset_e::UTF8:
+            return string(label_bytes.begin(), label_bytes.end());
+        case charset_e::UCS2:
+            try {
+                return ucs2toutf8(label_bytes.data(), label_bytes.size());
+            }
+            catch (const range_error&) {
+                return "";
+            }
+        case charset_e::UNDEFINED:
+            throw logic_error("charset undefined");
+    }
+    throw logic_error("invalid charset " + to_string((int)charset));
+}
+
+std::string label_t::shortlabel() const
+{
+    string shortlabel;
+    for (size_t i = 0; i < label_bytes.size(); ++i) {
+        if (shortlabel_flag & 0x8000 >> i) {
+            shortlabel += static_cast<char>(label_bytes[i]);
+        }
+    }
+
+    return shortlabel;
+}
 
 string label_t::assemble() const
 {
@@ -64,18 +96,23 @@ string label_t::assemble() const
         }
     }
 
-    switch (charset) {
-        case extended_label_charset::UTF8:
+    switch (extended_label_charset) {
+        case charset_e::COMPLETE_EBU_LATIN:
+            // FIG2 doesn't allow EBU, use FIG1 for those
+            return "";
+        case charset_e::UTF8:
             return string(segments_cat.begin(), segments_cat.end());
-        case extended_label_charset::UCS2:
+        case charset_e::UCS2:
             try {
                 return ucs2toutf8(segments_cat.data(), segments_cat.size());
             }
             catch (const range_error&) {
                 return "";
             }
+        case charset_e::UNDEFINED:
+            return "";
     }
-    throw logic_error("invalid charset");
+    throw logic_error("invalid extended label charset " + to_string((int)extended_label_charset));
 }
 
 string label_t::assembly_state() const
@@ -86,7 +123,22 @@ string label_t::assembly_state() const
         ss << s.first << ",";
     }
 
-    ss << "count=" << segment_count << "]";
+    ss << "count=" << segment_count << ",";
+    ss << "charset=";
+    switch (extended_label_charset) {
+        case charset_e::COMPLETE_EBU_LATIN:
+            throw logic_error("invalid extended label LATIN charset");
+        case charset_e::UTF8:
+            ss << "UTF8";
+            break;
+        case charset_e::UCS2:
+            ss << "UCS2";
+            break;
+        case charset_e::UNDEFINED:
+            ss << "UNDEFINED";
+            break;
+    }
+    ss << "]";
 
     return ss.str();
 }
diff --git a/src/ensembledatabase.hpp b/src/ensembledatabase.hpp
index cc080ec..dee434d 100644
--- a/src/ensembledatabase.hpp
+++ b/src/ensembledatabase.hpp
@@ -41,20 +41,27 @@
 
 namespace ensemble_database {
 
-enum class extended_label_charset {
-    UTF8, // encoding flag = 0
-    UCS2, // encoding flag = 1
+enum class charset_e {
+    COMPLETE_EBU_LATIN = 0,
+    UTF8 = 15, // encoding flag = 0
+    UCS2 = 6, // encoding flag = 1
+    UNDEFINED,
 };
 
 struct label_t {
-    // FIG 1 Label and shortlabel
-    std::string label;
+    // FIG 1 Label and shortlabel, in raw form
+    std::vector<uint8_t> label_bytes;
     uint16_t shortlabel_flag;
+    charset_e charset = charset_e::COMPLETE_EBU_LATIN;
+
+    // shortlabel() returns the flagged label bytes as-is (no charset conversion); label() returns the label as UTF-8
+    std::string shortlabel() const;
+    std::string label() const;
 
     // Extended Label from FIG 2
     std::map<int, std::vector<uint8_t> > segments;
     size_t segment_count = 0; // number if actual segments (not segment count as in spec)
-    extended_label_charset charset;
+    charset_e extended_label_charset = charset_e::UNDEFINED;
     uint8_t toggle_flag = 0;
 
     // Assemble all segments into a UTF-8 string. Returns an
@@ -92,8 +99,6 @@ struct component_t {
 
     bool primary;
 
-    label_t label;
-
     /* TODO
     uint8_t type;
 
diff --git a/src/etianalyse.cpp b/src/etianalyse.cpp
index b419593..69f4cf7 100644
--- a/src/etianalyse.cpp
+++ b/src/etianalyse.cpp
@@ -91,19 +91,6 @@ static void print_fig_result(const fig_result_t& fig_result, const display_setti
     }
 }
 
-static std::string flag_to_shortlabel(const ensemble_database::label_t label)
-{
-    stringstream shortlabel;
-    for (size_t i = 0; i < label.label.size(); ++i) {
-        if (label.shortlabel_flag & 0x8000 >> i) {
-            shortlabel << label.label[i];
-        }
-    }
-
-    return shortlabel.str();
-}
-
-
 void ETI_Analyser::analyse()
 {
     if (config.etifd != nullptr) {
@@ -578,8 +565,8 @@ void ETI_Analyser::eti_analyse()
         fprintf(stat_fd, "---\n");
         fprintf(stat_fd, "ensemble:\n");
         fprintf(stat_fd, "    id: 0x%x\n", ensemble.EId);
-        fprintf(stat_fd, "    label: %s\n", ensemble.label.label.c_str());
-        fprintf(stat_fd, "    shortlabel: %s\n", flag_to_shortlabel(ensemble.label).c_str());
+        fprintf(stat_fd, "    label: %s\n", ensemble.label.label().c_str());
+        fprintf(stat_fd, "    shortlabel: %s\n", ensemble.label.shortlabel().c_str());
         fprintf(stat_fd, "audio:\n");
 
         for (const auto& snoop : config.streams_to_decode) {
@@ -592,12 +579,9 @@ void ETI_Analyser::eti_analyse()
                         corresponding_service_found = true;
                         fprintf(stat_fd, "    - service_id: 0x%x\n", service.id);
                         fprintf(stat_fd, "      subchannel_id: 0x%x\n", component.subchId);
-                        fprintf(stat_fd, "      label: %s\n", service.label.label.c_str());
-                        fprintf(stat_fd, "      shortlabel: %s\n", flag_to_shortlabel(service.label).c_str());
-                        if (not component.label.label.empty()) {
-                            fprintf(stat_fd, "      component_label: %s\n", component.label.label.c_str());
-                        }
-                        // TODO FIG2 labels
+                        fprintf(stat_fd, "      label: %s\n", service.label.label().c_str());
+                        fprintf(stat_fd, "      shortlabel: %s\n", service.label.shortlabel().c_str());
+                        fprintf(stat_fd, "      extended_label: %s\n", service.label.assemble().c_str());
 
                         try {
                             const auto& subch = ensemble.get_subchannel(component.subchId);
diff --git a/src/fig1.cpp b/src/fig1.cpp
index 665544c..bcabf94 100644
--- a/src/fig1.cpp
+++ b/src/fig1.cpp
@@ -31,23 +31,37 @@
 
 using namespace std;
 
+static ensemble_database::charset_e charset_to_charset(uint8_t charset)
+{
+    using ensemble_database::charset_e;
+    if (charset == (uint8_t)charset_e::COMPLETE_EBU_LATIN) {
+        return charset_e::COMPLETE_EBU_LATIN;
+    }
+    else if (charset == (uint8_t)charset_e::UTF8) {
+        return charset_e::UTF8;
+    }
+    else if (charset == (uint8_t)charset_e::UCS2) {
+        return charset_e::UCS2;
+    }
+    else {
+        throw runtime_error("unsupported charset" + to_string(charset));
+    }
+}
+
 // SHORT LABELS
 fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp)
 {
-    uint16_t ext,charset;
-    uint16_t flag;
-    char label[17];
+    vector<uint8_t> label(16);
     fig_result_t r;
     uint8_t* f = fig1.f;
 
-    charset = (f[0] & 0xF0) >> 4;
+    uint8_t charset = (f[0] & 0xF0) >> 4;
     //oe = (f[0] & 0x08) >> 3;
-    ext = f[0] & 0x07;
+    uint16_t ext = f[0] & 0x07;
     r.msgs.push_back(strprintf("Charset=%d", charset));
 
-    memcpy(label, f+fig1.figlen-18, 16);
-    label[16] = 0x00;
-    flag = f[fig1.figlen-2] * 256 + \
+    memcpy(label.data(), f+fig1.figlen-18, 16);
+    uint16_t flag = f[fig1.figlen-2] * 256 + \
            f[fig1.figlen-1];
 
     switch (ext) {
@@ -56,13 +70,16 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp)
                 uint16_t eid;
                 eid = f[1] * 256 + f[2];
                 r.msgs.push_back(strprintf("Ensemble ID=0x%04X", eid));
-                r.msgs.push_back(strprintf("Label=\"%s\"", label));
-                r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
 
                 if (fig1.fibcrccorrect) {
                     fig1.ensemble.EId = eid;
-                    fig1.ensemble.label.label = label;
+                    fig1.ensemble.label.label_bytes = label;
                     fig1.ensemble.label.shortlabel_flag = flag;
+                    fig1.ensemble.label.charset = charset_to_charset(charset);
+
+                    r.msgs.push_back(strprintf("Label=\"%s\"", fig1.ensemble.label.label().c_str()));
+                    r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
+                    r.msgs.push_back(strprintf("Short label=%s", fig1.ensemble.label.shortlabel().c_str()));
                 }
             }
             break;
@@ -71,15 +88,18 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp)
             {   // ETSI EN 300 401 8.1.14.1
                 uint16_t sid;
                 sid = f[1] * 256 + f[2];
-                r.msgs.push_back(strprintf("Service ID=0x%04X", sid));
-                r.msgs.push_back(strprintf("Label=\"%s\"", label));
-                r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
 
                 if (fig1.fibcrccorrect) {
                     try {
                         auto& service = fig1.ensemble.get_service(sid);
-                        service.label.label = label;
+                        service.label.label_bytes = label;
                         service.label.shortlabel_flag = flag;
+                        service.label.charset = charset_to_charset(charset);
+
+                        r.msgs.push_back(strprintf("Service ID=0x%04X", sid));
+                        r.msgs.push_back(strprintf("Label=\"%s\"", service.label.label().c_str()));
+                        r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
+                        r.msgs.push_back(strprintf("Short label=%s", service.label.shortlabel().c_str()));
                     }
                     catch (ensemble_database::not_found &e) {
                         r.errors.push_back("Not yet in DB");
@@ -106,7 +126,8 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp)
                 }
                 r.msgs.push_back(strprintf("Service ID=0x%04X", sid));
                 r.msgs.push_back(strprintf("Service Component ID=0x%04X", SCIdS));
-                r.msgs.push_back(strprintf("Label=\"%s\"", label));
+                // TODO put label into ensembledatabase
+                r.msgs.push_back(strprintf("Label bytes=\"%s\"", string(label.begin(), label.end()).c_str()));
                 r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
             }
             break;
@@ -120,7 +141,8 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp)
                       f[4];
 
                 r.msgs.push_back(strprintf("Service ID=0x%04X", sid));
-                r.msgs.push_back(strprintf("Label=\"%s\"", label));
+                // TODO put label into ensembledatabase
+                r.msgs.push_back(strprintf("Label bytes=\"%s\"", string(label.begin(), label.end()).c_str()));
                 r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
             }
             break;
@@ -161,7 +183,8 @@ fig_result_t fig1_select(fig1_common_t& fig1, const display_settings_t &disp)
                 r.msgs.push_back(strprintf("Service ID=0x%04X", sid));
                 r.msgs.push_back(strprintf("Service Component ID=0x%04X", SCIdS));
                 r.msgs.push_back(strprintf("X-PAD App=%02X (", xpadapp) + xpadappdesc + ")");
-                r.msgs.push_back(strprintf("Label=\"%s\"", label));
+                // TODO put label into ensembledatabase
+                r.msgs.push_back(strprintf("Label bytes=\"%s\"", string(label.begin(), label.end()).c_str()));
                 r.msgs.push_back(strprintf("Short label mask=0x%04X", flag));
             }
             break;
diff --git a/src/fig2.cpp b/src/fig2.cpp
index 5c1227a..74243ce 100644
--- a/src/fig2.cpp
+++ b/src/fig2.cpp
@@ -40,6 +40,7 @@ static void handle_ext_label_data_field(fig2_common_t& fig2, ensemble_database::
 
     if (label.toggle_flag != fig2.toggle_flag()) {
         label.segments.clear();
+        label.extended_label_charset = ensemble_database::charset_e::UNDEFINED;
         label.toggle_flag = fig2.toggle_flag();
     }
 
@@ -55,10 +56,10 @@ static void handle_ext_label_data_field(fig2_common_t& fig2, ensemble_database::
         r.msgs.push_back(strprintf("Total number of segments=%d", segment_count + 1));
 
         if (encoding_flag) {
-            label.charset = ensemble_database::extended_label_charset::UCS2;
+            label.extended_label_charset = ensemble_database::charset_e::UCS2;
         }
         else {
-            label.charset = ensemble_database::extended_label_charset::UTF8;
+            label.extended_label_charset = ensemble_database::charset_e::UTF8;
         }
 
         if (fig2.rfu() == 0) {
author	Matthias P. Braendli <matthias.braendli@mpb.li>	2019-01-16 15:44:45 +0100
committer	Matthias P. Braendli <matthias.braendli@mpb.li>	2019-01-16 15:44:45 +0100
commit	9d3d404a46dd79ed2b7a6c39719c76839787127e (patch)
tree	9f67996d2f72b0f93302d695299b173e50698244
parent	1df947bae7346948e08edb75616d34fcf8802dae (diff)
download	etisnoop-9d3d404a46dd79ed2b7a6c39719c76839787127e.tar.gz etisnoop-9d3d404a46dd79ed2b7a6c39719c76839787127e.tar.bz2 etisnoop-9d3d404a46dd79ed2b7a6c39719c76839787127e.zip