summaryrefslogtreecommitdiffstats
path: root/lib/charset
diff options
context:
space:
mode:
authorMatthias P. Braendli <matthias.braendli@mpb.li>2018-02-09 12:02:13 +0100
committerMatthias P. Braendli <matthias.braendli@mpb.li>2018-02-09 12:02:13 +0100
commit4897c53db48386fbbd2556e8be6d5a60e9d1a7ba (patch)
tree30acae2ffeb12fbf189eec72709edc4ffab6a83a /lib/charset
parent2a96b61d31cc51611731297f936a477663871b9d (diff)
downloaddabmux-4897c53db48386fbbd2556e8be6d5a60e9d1a7ba.tar.gz
dabmux-4897c53db48386fbbd2556e8be6d5a60e9d1a7ba.tar.bz2
dabmux-4897c53db48386fbbd2556e8be6d5a60e9d1a7ba.zip
Move charset implementation to cpp file
Diffstat (limited to 'lib/charset')
-rw-r--r--lib/charset/charset.cpp62
-rw-r--r--lib/charset/charset.h76
2 files changed, 70 insertions, 68 deletions
diff --git a/lib/charset/charset.cpp b/lib/charset/charset.cpp
index 5cc773d..1abc097 100644
--- a/lib/charset/charset.cpp
+++ b/lib/charset/charset.cpp
@@ -1,5 +1,5 @@
/*
- Copyright (C) 2015 Matthias P. Braendli (http://opendigitalradio.org)
+ Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,13 +16,15 @@
*/
/*!
\file charset.cpp
- \brief Define the EBU charset according to ETSI TS 101 756v1.8.1 for DLS encoding
+ \brief A converter for UTF-8 to EBU Latin charset according to
+ ETSI TS 101 756 Annex C, used for DLS and Labels.
- \author Matthias P. Braendli <matthias@mpb.li>
+ \author Matthias P. Braendli
\author Lindsay Cornell
*/
#include "charset.h"
+#include <algorithm>
/**********************************************/
/************* BIG FAT WARNING ****************/
@@ -34,8 +36,10 @@
/********* END OF BIG FAT WARNING *************/
/**********************************************/
+#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented
+#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET)
-const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
+static const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
"Ę", "Į", "Ų", "Ă", "Ė", "Ď", "Ș", "Ț", "Ċ", "\n","\v","Ġ", "Ĺ", "Ż", "Ń",
"ą", "ę", "į", "ų", "ă", "ė", "ď", "ș", "ț", "ċ", "Ň", "Ě", "ġ", "ĺ", "ż", "\u0082",
" ", "!", "\"","#", "ł", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/",
@@ -53,10 +57,58 @@ const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES] = {
"Ã", "Å", "Æ", "Œ", "ŷ", "Ý", "Õ", "Ø", "Þ", "Ŋ", "Ŕ", "Ć", "Ś", "Ź", "Ť", "ð",
"ã", "å", "æ", "œ", "ŵ", "ý", "õ", "ø", "þ", "ŋ", "ŕ", "ć", "ś", "ź", "ť", "ħ"};
-std::string CharsetConverter::convert_ebu_to_utf8(const std::string& str)
+using namespace std;
+
+CharsetConverter::CharsetConverter()
{
+ /*! Build the conversion table: entry i holds the first code point decoded
+ * from utf8_encoded_EBU_Latin[i], i.e. it sits at the same index as in the EBU Latin table
+ */
using namespace std;
+ for (size_t i = 0; i < CHARSET_TABLE_ENTRIES; i++) {
+ string table_entry(utf8_encoded_EBU_Latin[i]);
+ string::iterator it = table_entry.begin();
+ uint32_t code_point = utf8::next(it, table_entry.end());
+ m_conversion_table.push_back(code_point);
+ }
+}
+
+std::string CharsetConverter::convert(std::string line_utf8, bool up_to_first_error)
+{
+ string::iterator end_it;
+
+ if (up_to_first_error) {
+ // Check for invalid UTF-8; only the text before the first error is converted
+ end_it = utf8::find_invalid(line_utf8.begin(), line_utf8.end());
+ }
+ else {
+ end_it = line_utf8.end();
+ }
+
+ // Decode the selected range to UTF-32, one vector element per code point
+ vector<uint32_t> utf32line;
+ utf8::utf8to32(line_utf8.begin(), end_it, back_inserter(utf32line));
+ string encoded_line(utf32line.size(), '0'); // placeholder fill; every position is overwritten below
+
+ // Map each code point to its EBU Latin byte (table index + offset); unknown code points become ' '
+ for (size_t i = 0; i < utf32line.size(); i++) {
+ vector<uint32_t>::iterator iter = find(m_conversion_table.begin(),
+ m_conversion_table.end(), utf32line[i]);
+ if (iter != m_conversion_table.end()) {
+ size_t index = std::distance(m_conversion_table.begin(), iter);
+
+ encoded_line[i] = (char)(index + CHARSET_TABLE_OFFSET);
+ }
+ else {
+ encoded_line[i] = ' ';
+ }
+ }
+ return encoded_line;
+}
+
+std::string CharsetConverter::convert_ebu_to_utf8(const std::string& str)
+{
string utf8_str;
for (const uint8_t c : str) {
// Table offset because NUL is not represented
diff --git a/lib/charset/charset.h b/lib/charset/charset.h
index c6a3001..8476ee7 100644
--- a/lib/charset/charset.h
+++ b/lib/charset/charset.h
@@ -1,5 +1,5 @@
/*
- Copyright (C) 2015 Matthias P. Braendli (http://opendigitalradio.org)
+ Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org)
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -16,87 +16,37 @@
*/
/*!
\file charset.h
- \brief Define the EBU charset according to ETSI TS 101 756v1.8.1 for DLS encoding
+ \brief A converter for UTF-8 to EBU Latin charset according to
+ ETSI TS 101 756 Annex C, used for DLS and Labels.
- \author Matthias P. Braendli <matthias@mpb.li>
+ \author Matthias P. Braendli
\author Lindsay Cornell
*/
-#ifndef __CHARSET_H_
-#define __CHARSET_H_
+#pragma once
-#include "utf8.h"
#include <cstdint>
#include <string>
#include <vector>
-#include <algorithm>
-
-#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented
-#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET)
-extern const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES];
+#include "utf8.h"
class CharsetConverter
{
public:
- CharsetConverter() {
- /*! Build the converstion table that contains the known code points,
- * at the indices corresponding to the EBU Latin table
- */
- using namespace std;
- for (size_t i = 0; i < CHARSET_TABLE_ENTRIES; i++) {
- string table_entry(utf8_encoded_EBU_Latin[i]);
- string::iterator it = table_entry.begin();
- uint32_t code_point = utf8::next(it, table_entry.end());
- m_conversion_table.push_back(code_point);
- }
- }
+ CharsetConverter();
/*! Convert a UTF-8 encoded text line into an EBU Latin encoded byte
- * stream. If up_to_first_error is set, convert as much text as possible.
- * If false, raise an exception in case of conversion errors.
+ * stream. If up_to_first_error is set, convert as much text as possible.
+ * If false, raise a utf8::exception in case of conversion errors.
*/
- std::string convert(std::string line_utf8, bool up_to_first_error = true) {
- using namespace std;
-
- // check for invalid utf-8, we only convert up to the first error
- string::iterator end_it;
- if (up_to_first_error) {
- end_it = utf8::find_invalid(line_utf8.begin(), line_utf8.end());
- }
- else {
- end_it = line_utf8.end();
- }
-
- // Convert it to utf-32
- vector<uint32_t> utf32line;
- utf8::utf8to32(line_utf8.begin(), end_it, back_inserter(utf32line));
+ std::string convert(std::string line_utf8, bool up_to_first_error = true);
- string encoded_line(utf32line.size(), '0');
-
- // Try to convert each codepoint
- for (size_t i = 0; i < utf32line.size(); i++) {
- vector<uint32_t>::iterator iter = find(m_conversion_table.begin(),
- m_conversion_table.end(), utf32line[i]);
- if (iter != m_conversion_table.end()) {
- size_t index = std::distance(m_conversion_table.begin(), iter);
-
- encoded_line[i] = (char)(index + CHARSET_TABLE_OFFSET);
- }
- else {
- encoded_line[i] = ' ';
- }
- }
- return encoded_line;
- }
-
- /* Convert a EBU Latin byte stream to a UTF-8 encoded string.
- * Invalid input characters are converted to ⁇ (unicode U+2047).
+ /*! Convert an EBU Latin byte stream to a UTF-8 encoded string.
+ * Invalid input characters are converted to ⁇ (unicode U+2047).
*/
std::string convert_ebu_to_utf8(const std::string& str);
private:
+ // Representation of the EBU Latin table as 32-bit Unicode code points
std::vector<uint32_t> m_conversion_table;
};
-
-
-#endif