diff options
author | Matthias P. Braendli <matthias.braendli@mpb.li> | 2018-02-09 12:02:50 +0100 |
---|---|---|
committer | Matthias P. Braendli <matthias.braendli@mpb.li> | 2018-02-09 12:02:50 +0100 |
commit | 2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce (patch) | |
tree | ee9ca1a6cbb5f7d4f6a8c178125de24ad9312784 /src/charset.h | |
parent | 587bb4d04bfeed40ba744e8230dd5acdcad39bd6 (diff) | |
download | ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.tar.gz ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.tar.bz2 ODR-PadEnc-2e2db961945de6f46ab0c31e9d5afe9d4a4e28ce.zip |
Update charset converter
Diffstat (limited to 'src/charset.h')
-rw-r--r-- | src/charset.h | 72 |
1 files changed, 17 insertions, 55 deletions
diff --git a/src/charset.h b/src/charset.h index 3b5f102..8476ee7 100644 --- a/src/charset.h +++ b/src/charset.h @@ -1,5 +1,5 @@ /* - Copyright (C) 2015 Matthias P. Braendli (http://opendigitalradio.org) + Copyright (C) 2018 Matthias P. Braendli (http://opendigitalradio.org) This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -16,75 +16,37 @@ */ /*! \file charset.h - \brief Define the EBU charset according to ETSI TS 101 756v1.8.1 for DLS encoding + \brief A converter for UTF-8 to EBU Latin charset according to + ETSI TS 101 756 Annex C, used for DLS and Labels. - \author Matthias P. Braendli <matthias@mpb.li> + \author Matthias P. Braendli \author Lindsay Cornell */ -#ifndef __CHARSET_H_ -#define __CHARSET_H_ +#pragma once -#include "common.h" - -#include "utf8.h" +#include <cstdint> #include <string> #include <vector> -#include <algorithm> - -#define CHARSET_TABLE_OFFSET 1 // NUL at index 0 cannot be represented -#define CHARSET_TABLE_ENTRIES (256 - CHARSET_TABLE_OFFSET) -extern const char* utf8_encoded_EBU_Latin[CHARSET_TABLE_ENTRIES]; +#include "utf8.h" class CharsetConverter { public: - CharsetConverter() { - /*! Build the converstion table that contains the known code points, - * at the indices corresponding to the EBU Latin table - */ - using namespace std; - for (size_t i = 0; i < CHARSET_TABLE_ENTRIES; i++) { - string table_entry(utf8_encoded_EBU_Latin[i]); - string::iterator it = table_entry.begin(); - uint32_t code_point = utf8::next(it, table_entry.end()); - m_conversion_table.push_back(code_point); - } - } + CharsetConverter(); - /*! Convert a UTF-8 encoded text line into an EBU Latin encoded byte stream + /*! Convert a UTF-8 encoded text line into an EBU Latin encoded byte + * stream. If up_to_first_error is set, convert as much text as possible. + * If false, raise an utf8::exception in case of conversion errors. */ - std::string convert(std::string line_utf8) { - using namespace std; - - // check for invalid utf-8, we only convert up to the first error - string::iterator end_it = utf8::find_invalid(line_utf8.begin(), line_utf8.end()); - - // Convert it to utf-32 - vector<uint32_t> utf32line; - utf8::utf8to32(line_utf8.begin(), end_it, back_inserter(utf32line)); + std::string convert(std::string line_utf8, bool up_to_first_error = true); - string encoded_line(utf32line.size(), '0'); - - // Try to convert each codepoint - for (size_t i = 0; i < utf32line.size(); i++) { - vector<uint32_t>::iterator iter = find(m_conversion_table.begin(), - m_conversion_table.end(), utf32line[i]); - if (iter != m_conversion_table.end()) { - size_t index = std::distance(m_conversion_table.begin(), iter); - - encoded_line[i] = (char)(index + CHARSET_TABLE_OFFSET); - } - else { - encoded_line[i] = ' '; - } - } - return encoded_line; - } + /*! Convert a EBU Latin byte stream to a UTF-8 encoded string. + * Invalid input characters are converted to ⁇ (unicode U+2047). + */ + std::string convert_ebu_to_utf8(const std::string& str); private: - + // Representation of the table in 32-bit unicode std::vector<uint32_t> m_conversion_table; }; - -#endif |