Update charset converter

author: Matthias P. Braendli <matthias.braendli@mpb.li> 2018-02-09 12:02:50 +0100
committer: Matthias P. Braendli <matthias.braendli@mpb.li> 2018-02-09 12:02:50 +0100
commit: 2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce (patch)
tree: ee9ca1a6cbb5f7d4f6a8c178125de24ad9312784
parent: 587bb4d04bfeed40ba744e8230dd5acdcad39bd6 (diff)
download: ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.tar.gz
ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.tar.bz2
ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.zip
2 files changed, 93 insertions, 59 deletions
diff --git a/src/charset.cpp b/src/charset.cpp
index 2ee14f8..1abc097 100644
--- a/src/charset.cpp
+++ b/src/charset.cpp
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2015 Matthias P. Braendli (http://opendigitalradio.org)
+    Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org)
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,13 +16,15 @@
 */
 /*!
     \file charset.cpp
-    \brief Define the EBU charset according to ETSI TS 101 756v1.8.1 for DLS encoding
+    \brief A converter for UTF-8 to EBU Latin charset according to
+           ETSI TS 101 756 Annex C, used for DLS and Labels.
 
-    \author Matthias P. Braendli <matthias@mpb.li>
+    \author Matthias P. Braendli
     \author Lindsay Cornell
 */
 
 #include "charset.h"
+#include <algorithm>
 
 /**********************************************/
 /************* BIG FAT WARNING ****************/
@@ -34,8 +36,10 @@
 /********* END OF BIG FAT WARNING *************/
 /**********************************************/
 
+#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented
+#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET)
 
-const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
+static const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
      "Ę", "Į", "Ų", "Ă", "Ė", "Ď", "Ș", "Ț", "Ċ", "\n","\v","Ġ", "Ĺ", "Ż", "Ń",
 "ą", "ę", "į", "ų", "ă", "ė", "ď", "ș", "ț", "ċ", "Ň", "Ě", "ġ", "ĺ", "ż", "\u0082",
 " ", "!", "\"","#", "ł", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
@@ -52,3 +56,71 @@ const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
 "Â", "Ä", "Ê", "Ë", "Î", "Ï", "Ô", "Ö", "Û", "Ü", "ř", "č", "š", "ž", "đ", "ŀ",
 "Ã", "Å", "Æ", "Œ", "ŷ", "Ý", "Õ", "Ø", "Þ", "Ŋ", "Ŕ", "Ć", "Ś", "Ź", "Ť", "ð",
 "ã", "å", "æ", "œ", "ŵ", "ý", "õ", "ø", "þ", "ŋ", "ŕ", "ć", "ś", "ź", "ť", "ħ"};
+
+using namespace std;
+
+CharsetConverter::CharsetConverter()
+{
+    /*! Build the conversion table: entry i holds the Unicode code point of
+     * utf8_encoded_EBU_Latin[i], i.e. of EBU code i + CHARSET_TABLE_OFFSET
+     */
+    using namespace std;
+    for (size_t i = 0; i < CHARSET_TABLE_ENTRIES; i++) {
+        string table_entry(utf8_encoded_EBU_Latin[i]);
+        string::iterator it = table_entry.begin();
+        uint32_t code_point = utf8::next(it, table_entry.end()); // decode the entry's first code point
+        m_conversion_table.push_back(code_point);
+    }
+}
+
+std::string CharsetConverter::convert(std::string line_utf8, bool up_to_first_error)
+{
+    string::iterator end_it;
+
+    if (up_to_first_error) {
+        // Locate the first invalid UTF-8 sequence: only the valid prefix is converted
+        end_it = utf8::find_invalid(line_utf8.begin(), line_utf8.end());
+    }
+    else {
+        end_it = line_utf8.end(); // whole line; invalid UTF-8 is left for utf8to32 to report
+    }
+
+    // Decode the selected range into UTF-32 code points
+    vector<uint32_t> utf32line;
+    utf8::utf8to32(line_utf8.begin(), end_it, back_inserter(utf32line));
+
+    string encoded_line(utf32line.size(), '0'); // one byte per code point; '0' is only a placeholder
+
+    // Map each code point to its EBU byte with a linear search of the table
+    for (size_t i = 0; i < utf32line.size(); i++) {
+        vector<uint32_t>::iterator iter = find(m_conversion_table.begin(),
+                m_conversion_table.end(), utf32line[i]);
+        if (iter != m_conversion_table.end()) {
+            size_t index = std::distance(m_conversion_table.begin(), iter);
+
+            encoded_line[i] = (char)(index + CHARSET_TABLE_OFFSET); // table index 0 is EBU code 1
+        }
+        else {
+            encoded_line[i] = ' '; // code point not in the table: substitute a space
+        }
+    }
+    return encoded_line;
+}
+
+std::string CharsetConverter::convert_ebu_to_utf8(const std::string& str)
+{
+    string utf8_str;
+    for (const uint8_t c : str) { // each input byte is read as an unsigned value
+        // The table starts at EBU code CHARSET_TABLE_OFFSET because NUL is not represented
+        if (c >= CHARSET_TABLE_OFFSET) {
+            string utf8_char(utf8_encoded_EBU_Latin[c - CHARSET_TABLE_OFFSET]);
+            utf8_str += utf8_char;
+        }
+        else {
+            utf8_str += "⁇"; // byte below the table offset (NUL): emit a visible marker
+        }
+    }
+
+    return utf8_str;
+}
+
diff --git a/src/charset.h b/src/charset.h
index 3b5f102..8476ee7 100644
--- a/src/charset.h
+++ b/src/charset.h
@@ -1,5 +1,5 @@
 /*
-    Copyright (C) 2015 Matthias P. Braendli (http://opendigitalradio.org)
+    Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org)
 
     This program is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,75 +16,37 @@
 */
 /*!
     \file charset.h
-    \brief Define the EBU charset according to ETSI TS 101 756v1.8.1 for DLS encoding
+    \brief A converter for UTF-8 to EBU Latin charset according to
+           ETSI TS 101 756 Annex C, used for DLS and Labels.
 
-    \author Matthias P. Braendli <matthias@mpb.li>
+    \author Matthias P. Braendli
     \author Lindsay Cornell
 */
 
-#ifndef __CHARSET_H_
-#define __CHARSET_H_
+#pragma once
 
-#include "common.h"
-
-#include "utf8.h"
+#include <cstdint>
 #include <string>
 #include <vector>
-#include <algorithm>
-
-#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented
-#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET)
-extern const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES];
+#include "utf8.h"
 
 class CharsetConverter
 {
     public:
-        CharsetConverter() {
-            /*! Build the converstion table that contains the known code points,
-             * at the indices corresponding to the EBU Latin table
-             */
-            using namespace std;
-            for (size_t i = 0; i < CHARSET_TABLE_ENTRIES; i++) {
-                string table_entry(utf8_encoded_EBU_Latin[i]);
-                string::iterator it = table_entry.begin();
-                uint32_t code_point = utf8::next(it, table_entry.end());
-                m_conversion_table.push_back(code_point);
-            }
-        }
+        CharsetConverter();
 
-        /*! Convert a UTF-8 encoded text line into an EBU Latin encoded byte stream
+        /*! Convert a UTF-8 encoded text line into an EBU Latin encoded byte
+         *  stream. If up_to_first_error is set, conversion stops silently at
+         *  the first invalid UTF-8 sequence; otherwise utf8::exception is thrown.
          */
-        std::string convert(std::string line_utf8) {
-            using namespace std;
-
-            // check for invalid utf-8, we only convert up to the first error
-            string::iterator end_it = utf8::find_invalid(line_utf8.begin(), line_utf8.end());
-
-            // Convert it to utf-32
-            vector<uint32_t> utf32line;
-            utf8::utf8to32(line_utf8.begin(), end_it, back_inserter(utf32line));
+        std::string convert(std::string line_utf8, bool up_to_first_error = true);
 
-            string encoded_line(utf32line.size(), '0');
-
-            // Try to convert each codepoint
-            for (size_t i = 0; i < utf32line.size(); i++) {
-                vector<uint32_t>::iterator iter = find(m_conversion_table.begin(),
-                        m_conversion_table.end(), utf32line[i]);
-                if (iter != m_conversion_table.end()) {
-                    size_t index = std::distance(m_conversion_table.begin(), iter);
-
-                    encoded_line[i] = (char)(index + CHARSET_TABLE_OFFSET);
-                }
-                else {
-                    encoded_line[i] = ' ';
-                }
-            }
-            return encoded_line;
-        }
+        /*! Convert an EBU Latin byte stream to a UTF-8 encoded string.
+         *  Invalid input characters are converted to ⁇ (Unicode U+2047).
+         */
+        std::string convert_ebu_to_utf8(const std::string& str);
 
     private:
-
+        // Representation of the table in 32-bit unicode
         std::vector<uint32_t> m_conversion_table;
 };
-
-#endif
author	Matthias P. Braendli <matthias.braendli@mpb.li>	2018-02-09 12:02:50 +0100
committer	Matthias P. Braendli <matthias.braendli@mpb.li>	2018-02-09 12:02:50 +0100
commit	2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce (patch)
tree	ee9ca1a6cbb5f7d4f6a8c178125de24ad9312784
parent	587bb4d04bfeed40ba744e8230dd5acdcad39bd6 (diff)
download	ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.tar.gz ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.tar.bz2 ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.zip