3 Answers
13
Update: The functions listed here are maintained in a GitHub repo, as .hpp, .cpp and test files. Some UTF-16 functions have been disabled because they do not work correctly. The “banana” tests in the utf.test.cpp file demonstrate the problem.
Also included is a “read_with_bom” function for recognizing byte order marks.
#if _MSC_VER == 1900 //work around for bug in MS Visual C++ 2015 https://social.msdn.microsoft.com/Forums/en-US/8f40dcd8-c67f-4eba-9134-a19b9178e481/vs-2015-rc-linker-stdcodecvt-error?forum=vcgeneral
// Encode a UTF-16 string as UTF-8 (Visual C++ 2015 variant).
//
// int16_t stands in for char16_t as the facet element type: this is the
// workaround for the VS 2015 std::codecvt linker bug referenced on the
// enclosing #if.
//
// The facet is codecvt_utf8_utf16, not codecvt_utf8. For a 16-bit element type
// codecvt_utf8 converts UCS-2 only, so surrogate pairs (code points above
// U+FFFF) would not be encoded correctly.
//
// Throws std::range_error (from std::wstring_convert) if the input cannot be
// converted.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> convert;
    // View the char16_t code units as int16_t so they match the facet's element type.
    auto p = reinterpret_cast<const int16_t *>(s.data());
    return convert.to_bytes(p, p + s.size());
}
// Encode a UTF-32 string as UTF-8 (Visual C++ 2015 variant).
// int32_t is used as the facet element type instead of char32_t; this branch
// is the workaround for the VS 2015 codecvt bug referenced on the enclosing #if.
std::string to_utf8(const std::u32string &s)
{
std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
// View the char32_t code points as int32_t so they match the facet's element type.
auto p = reinterpret_cast<const int32_t *>(s.data());
// Convert the half-open range [p, p + size) to a UTF-8 byte string.
return convert.to_bytes(p, p + s.size());
}
// Decode a UTF-8 byte string into UTF-16 (Visual C++ 2015 variant).
//
// int16_t stands in for char16_t as the facet element type: this is the
// workaround for the VS 2015 std::codecvt linker bug referenced on the
// enclosing #if.
//
// The facet is codecvt_utf8_utf16, not codecvt_utf8. For a 16-bit element type
// codecvt_utf8 produces UCS-2 only, so code points above U+FFFF could not be
// represented as surrogate pairs.
//
// Throws std::range_error (from std::wstring_convert) if the input cannot be
// converted.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<int16_t>, int16_t> convert;
    auto asInt = convert.from_bytes(s);
    // Copy the int16_t code units back out as char16_t.
    return std::u16string(reinterpret_cast<char16_t const *>(asInt.data()), asInt.length());
}
// Decode a UTF-8 byte string into UTF-32 (Visual C++ 2015 variant).
// int32_t is used as the facet element type instead of char32_t; this branch
// is the workaround for the VS 2015 codecvt bug referenced on the enclosing #if.
std::u32string to_utf32(const std::string &s)
{
std::wstring_convert<std::codecvt_utf8<int32_t>, int32_t> convert;
// from_bytes yields a string of int32_t elements.
auto asInt = convert.from_bytes(s);
// Copy those elements back out as char32_t code points.
return std::u32string(reinterpret_cast<char32_t const *>(asInt.data()), asInt.length());
}
#else
// Encode a UTF-16 string as UTF-8.
//
// Uses codecvt_utf8_utf16 rather than codecvt_utf8: with char16_t elements
// codecvt_utf8 is a UTF-8 <-> UCS-2 facet, so input containing surrogate
// pairs (code points above U+FFFF) is not converted correctly.
//
// Throws std::range_error (from std::wstring_convert) if the input cannot be
// converted.
std::string to_utf8(const std::u16string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> conv;
    return conv.to_bytes(s);
}
// Encode a UTF-32 string as UTF-8.
// Conversion failures surface as the exception thrown by std::wstring_convert.
std::string to_utf8(const std::u32string &s)
{
    using Utf8Facet = std::codecvt_utf8<char32_t>;
    using Converter = std::wstring_convert<Utf8Facet, char32_t>;

    Converter utf8Converter;
    std::string encoded = utf8Converter.to_bytes(s);
    return encoded;
}
// Decode a UTF-8 byte string into UTF-16.
//
// Uses codecvt_utf8_utf16 rather than codecvt_utf8: with char16_t elements
// codecvt_utf8 is a UTF-8 <-> UCS-2 facet, so code points above U+FFFF cannot
// be produced as surrogate pairs.
//
// Throws std::range_error (from std::wstring_convert) if the input cannot be
// converted.
std::u16string to_utf16(const std::string &s)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> convert;
    return convert.from_bytes(s);
}
// Decode a UTF-8 byte string into UTF-32.
// Conversion failures surface as the exception thrown by std::wstring_convert.
std::u32string to_utf32(const std::string &s)
{
    using Utf8Facet = std::codecvt_utf8<char32_t>;
    using Converter = std::wstring_convert<Utf8Facet, char32_t>;

    Converter utf8Converter;
    std::u32string decoded = utf8Converter.from_bytes(s);
    return decoded;
}
#endif
// Convert a UTF-32 string to UTF-16.
//
// The conversion is done directly instead of round-tripping through UTF-8, so
// code points above U+FFFF are always emitted as surrogate pairs and no
// intermediate byte string is allocated.
//
// Throws std::range_error if a code point is a surrogate (U+D800..U+DFFF) or
// lies above U+10FFFF, since neither can be represented in UTF-16.
std::u16string to_utf16(const std::u32string &s)
{
    std::u16string out;
    out.reserve(s.size());
    for (const char32_t cp : s) {
        if (cp >= 0xD800 && cp <= 0xDFFF) {
            throw std::range_error("surrogate code point in UTF-32 input");
        }
        if (cp > 0x10FFFF) {
            throw std::range_error("code point above U+10FFFF in UTF-32 input");
        }
        if (cp < 0x10000) {
            out.push_back(static_cast<char16_t>(cp));
        } else {
            // Split the 20 bits above 0x10000 into a high/low surrogate pair.
            const char32_t offset = cp - 0x10000;
            out.push_back(static_cast<char16_t>(0xD800 + (offset >> 10)));
            out.push_back(static_cast<char16_t>(0xDC00 + (offset & 0x3FF)));
        }
    }
    return out;
}
// Convert a UTF-16 string to UTF-32.
//
// The conversion is done directly instead of round-tripping through UTF-8, so
// surrogate pairs are always combined into a single code point and no
// intermediate byte string is allocated.
//
// Throws std::range_error if the input contains an unpaired surrogate.
std::u32string to_utf32(const std::u16string &s) {
    std::u32string out;
    out.reserve(s.size());
    for (std::u16string::size_type i = 0; i < s.size(); ++i) {
        const char32_t unit = s[i];
        const bool isHigh = unit >= 0xD800 && unit <= 0xDBFF;
        const bool isLow = unit >= 0xDC00 && unit <= 0xDFFF;
        if (isHigh) {
            // A high surrogate must be followed immediately by a low surrogate.
            if (i + 1 >= s.size() || s[i + 1] < 0xDC00 || s[i + 1] > 0xDFFF) {
                throw std::range_error("unpaired high surrogate in UTF-16 input");
            }
            const char32_t low = s[i + 1];
            out.push_back(0x10000 + ((unit - 0xD800) << 10) + (low - 0xDC00));
            ++i; // the low surrogate has been consumed as well
        } else if (isLow) {
            throw std::range_error("unpaired low surrogate in UTF-16 input");
        } else {
            out.push_back(unit);
        }
    }
    return out;
}
// Read all of `src`, detect its encoding from a leading byte order mark, and
// return the content (without the BOM) as UTF-32.
//
// Recognised BOMs: UTF-32 BE/LE, UTF-16 BE/LE and UTF-8. Input without a BOM is
// decoded as UTF-8 (which also covers plain ASCII).
//
// Throws std::logic_error when UTF-32 / UTF-16 content is not a whole number of
// 4- / 2-byte code units. Errors raised by the to_utf32 overloads propagate
// unchanged.
std::u32string read_with_bom(std::istream & src)
{
    enum encoding {
        encoding_utf32be = 0,
        encoding_utf32le,
        encoding_utf16be,
        encoding_utf16le,
        encoding_utf8,
        encoding_ascii,
    };
    // Indexed by `encoding`, so the order must match the enum. The UTF-32 marks
    // come first because the UTF-32 LE mark begins with the UTF-16 LE mark.
    const std::vector<std::string> boms = {
        std::string("\x00\x00\xFE\xFF", 4),
        std::string("\xFF\xFE\x00\x00", 4),
        std::string("\xFE\xFF", 2),
        std::string("\xFF\xFE", 2),
        std::string("\xEF\xBB\xBF", 3)
    };

    std::string buffer((std::istreambuf_iterator<char>(src)), std::istreambuf_iterator<char>());

    encoding enc = encoding_ascii;
    for (std::size_t i = 0; i < boms.size(); ++i) {
        const std::string &bom = boms[i];
        if (buffer.compare(0, bom.length(), bom) == 0) {
            enc = static_cast<encoding>(i);
            buffer.erase(0, bom.length());
            break;
        }
    }

    // Fetch a byte as an unsigned value. Plain `char` may be signed, and
    // shifting/OR-ing a negative char would sign-extend and corrupt every code
    // unit that contains a byte >= 0x80.
    const auto byte_at = [&buffer](std::size_t index) -> char32_t {
        return static_cast<unsigned char>(buffer[index]);
    };

    switch (enc) {
    case encoding_utf32be:
    case encoding_utf32le:
    {
        if (buffer.length() % 4 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 4");
        }
        const bool bigEndian = (enc == encoding_utf32be);
        std::u32string temp(buffer.length() / 4, U'\0');
        for (std::size_t i = 0; i < temp.size(); ++i) {
            const char32_t b0 = byte_at(i * 4 + 0);
            const char32_t b1 = byte_at(i * 4 + 1);
            const char32_t b2 = byte_at(i * 4 + 2);
            const char32_t b3 = byte_at(i * 4 + 3);
            temp[i] = bigEndian ? (b0 << 24 | b1 << 16 | b2 << 8 | b3)
                                : (b3 << 24 | b2 << 16 | b1 << 8 | b0);
        }
        return temp;
    }
    case encoding_utf16be:
    case encoding_utf16le:
    {
        if (buffer.length() % 2 != 0) {
            throw std::logic_error("size in bytes must be a multiple of 2");
        }
        const bool bigEndian = (enc == encoding_utf16be);
        std::u16string temp(buffer.length() / 2, u'\0');
        for (std::size_t i = 0; i < temp.size(); ++i) {
            const char32_t b0 = byte_at(i * 2 + 0);
            const char32_t b1 = byte_at(i * 2 + 1);
            temp[i] = static_cast<char16_t>(bigEndian ? (b0 << 8 | b1) : (b1 << 8 | b0));
        }
        return to_utf32(temp);
    }
    default:
        // UTF-8 (BOM already stripped) or no BOM at all: decode as UTF-8.
        return to_utf32(buffer);
    }
}
answered Jul 31 ’16 at 21:10
BrentBrent
3,50732357
8
-
2
Consider posting this to codereview.stackexchange.com for code review if everything works properly.
– user2296177
Jul 31 ’16 at 21:19
-
I came to stack overflow to ask this question, and found there was no answer. I think I’ve used the site correctly.
– Brent
Jul 31 ’16 at 21:20
-
2
I’m not saying you’ve used stack overflow incorrectly; I think this is a good question to ask on code review.
– user2296177
Jul 31 ’16 at 21:21
-
Why does your UTF-16->UTF-8 use
uint16_t
instead of char16_t
? Same goes for char32_t
and UTF-32. Is that required somewhere? – Nicol Bolas
Aug 1 ’16 at 1:52
-
1
error C4996: 'std::codecvt_utf8<char32_t,1114111,0>': warning STL4017: std::wbuffer_convert, std::wstring_convert, and the <codecvt> header (containing std::codecvt_mode, std::codecvt_utf8, std::codecvt_utf16, and std::codecvt_utf8_utf16) are deprecated in C++17. (The std::codecvt class template is NOT deprecated.) The C++ Standard doesn't provide equivalent non-deprecated functionality; consider using MultiByteToWideChar() and WideCharToMultiByte() from <Windows.h> instead. ...
C++17 in 2019 might require an update to this answer. – BitTickler
Aug 10 ’19 at 0:31
3
I’ve written a little utf_ranges library for doing just this. It uses Range-V3 and C++14.
It has both views and actions (if you’re familiar with Range-V3 terminology) for converting between any of the three main UTF encodings, can consume and generate byte order marks, and perform endian conversion based on a bom. For example, reading a file from unknown-endian UTF-16 into a UTF-8 std::string
, converting any of the seven unicode line endings to \n
, looks like this:
// Usage example for the answer author's utf_ranges library (the utf:: names are
// not defined in this file). `path` is expected to be supplied by the caller.
// Open the input in binary mode.
std::ifstream source{path, std::ios::binary};
// Per the author's description: read the stream as UTF-16 of unknown
// endianness, consume the byte order mark, re-encode as UTF-8, and convert the
// Unicode line endings to \n, collecting the result in a std::string.
std::string str = utf::istreambuf<char16_t>(source)
| utf::view::consume_bom
| utf::view::utf8
| utf::view::line_end_convert;
answered Aug 6 ’16 at 1:40
Tristan BrindleTristan Brindle
15.2k23181
1
-
Pretty nice! Any possibility for conversion to a one-header & one-source version?
– Brent
Sep 25 ’16 at 20:23
0
Here’s my UTF-8 code from Baby X
(https://github.com/MalcolmMcLean/babyx)
/*
 * Correction constants indexed by the number of trailing bytes in a sequence.
 * bbx_utf8_getch() sums the raw bytes (shifting by 6 between them) and then
 * subtracts the entry for that trailing-byte count.
 */
static const unsigned int offsetsFromUTF8[6] =
{
  0x00000000U,  /* 0 trailing bytes */
  0x00003080U,  /* 1 trailing byte  */
  0x000E2080U,  /* 2 trailing bytes */
  0x03C82080U,  /* 3 trailing bytes */
  0xFA082080U,  /* 4 trailing bytes */
  0x82082080U   /* 5 trailing bytes */
};
/*
 * Number of bytes that follow a given first byte, indexed by that byte's
 * value (0..255). One row per 16 byte values.
 */
static const unsigned char trailingBytesForUTF8[256] = {
  /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
  /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
/*
 * Test whether a NUL-terminated string is well-formed UTF-8.
 *
 * Params: str - the NUL-terminated byte string to check.
 * Returns: 1 if every sequence in the string passes the checks below,
 *          0 otherwise.
 *
 * A sequence is rejected when:
 *   - its first byte announces a length outside 1..4,
 *   - it runs past the end of the string,
 *   - a trailing byte is not of the form 10xxxxxx,
 *   - the decoded value does not need that many bytes (overlong form), or
 *   - the decoded value is above U+10FFFF.
 * Surrogate values (U+D800..U+DFFF) are not rejected by this function.
 */
int bbx_isutf8z(const char *str)
{
  int len = 0;
  int pos = 0;
  int nb;
  int i;
  int ch;

  while(str[len])
    len++;
  while(pos < len && *str)
  {
    nb = bbx_utf8_skip(str);
    if(nb < 1 || nb > 4)
      return 0;
    if(pos + nb > len)
      return 0;
    for(i=1;i<nb;i++)
      if( (str[i] & 0xC0) != 0x80 )
        return 0;
    ch = bbx_utf8_getch(str);
    /* Each value must use the shortest encoding for its range. */
    if(ch < 0x80)
    {
      if(nb != 1)
        return 0;
    }
    else if(ch < 0x800)   /* two bytes cover U+0080..U+07FF only */
    {
      if(nb != 2)
        return 0;
    }
    else if(ch < 0x10000)
    {
      if(nb != 3)
        return 0;
    }
    else if(ch < 0x110000)
    {
      if(nb != 4)
        return 0;
    }
    else
    {
      /* beyond U+10FFFF: not a Unicode code point */
      return 0;
    }
    pos += nb;
    str += nb;
  }
  return 1;
}
/*
 * Length in bytes of the sequence that starts at utf8, as announced by its
 * first byte: the trailing-byte count from trailingBytesForUTF8 plus one.
 */
int bbx_utf8_skip(const char *utf8)
{
  unsigned char lead = (unsigned char) utf8[0];

  return 1 + trailingBytesForUTF8[lead];
}
/*
 * Decode the sequence starting at utf8 and return its value.
 *
 * The raw bytes are accumulated with a 6-bit shift between them, then the
 * entry of offsetsFromUTF8 for the trailing-byte count is subtracted.
 * Only first bytes announcing 0..3 trailing bytes are decoded; for a first
 * byte announcing 4 or 5 trailing bytes no input bytes are accumulated and
 * only the table offset is subtracted from zero.
 */
int bbx_utf8_getch(const char *utf8)
{
  const unsigned char *bytes = (const unsigned char *) utf8;
  int trailing = trailingBytesForUTF8[bytes[0]];
  int ch = 0;
  int i;

  if (trailing <= 3)
  {
    /* every byte except the last is added and then shifted up by 6 bits */
    for (i = 0; i < trailing; i++)
    {
      ch += bytes[i];
      ch <<= 6;
    }
    ch += bytes[trailing];
  }
  ch -= offsetsFromUTF8[trailing];
  return ch;
}
/*
 * Write the UTF-8 encoding of ch to out.
 *
 * Params: out - destination buffer; up to 4 bytes are written, with no
 *               terminating NUL.
 *         ch  - the value to encode.
 * Returns: the number of bytes written, or 0 (nothing written) when ch is
 *          0x110000 or greater.
 */
int bbx_utf8_putch(char *out, int ch)
{
  int nbytes;

  if (ch >= 0x110000)
  {
    nbytes = 0;
  }
  else if (ch >= 0x10000)
  {
    out[0] = (char) ((ch >> 18) | 0xF0);
    out[1] = (char) (((ch >> 12) & 0x3F) | 0x80);
    out[2] = (char) (((ch >> 6) & 0x3F) | 0x80);
    out[3] = (char) ((ch & 0x3F) | 0x80);
    nbytes = 4;
  }
  else if (ch >= 0x800)
  {
    out[0] = (char) ((ch >> 12) | 0xE0);
    out[1] = (char) (((ch >> 6) & 0x3F) | 0x80);
    out[2] = (char) ((ch & 0x3F) | 0x80);
    nbytes = 3;
  }
  else if (ch >= 0x80)
  {
    out[0] = (char) ((ch >> 6) | 0xC0);
    out[1] = (char) ((ch & 0x3F) | 0x80);
    nbytes = 2;
  }
  else
  {
    out[0] = (char) ch;
    nbytes = 1;
  }
  return nbytes;
}
/*
 * Number of bytes the UTF-8 encoding of ch occupies: 1 to 4, or 0 when ch is
 * 0x110000 or greater.
 */
int bbx_utf8_charwidth(int ch)
{
  if (ch >= 0x110000)
    return 0;
  if (ch >= 0x10000)
    return 4;
  if (ch >= 0x800)
    return 3;
  if (ch >= 0x80)
    return 2;
  return 1;
}
/*
 * Count the sequences in a NUL-terminated string by stepping from one first
 * byte to the next with bbx_utf8_skip() until the terminating NUL is reached.
 * No validation of the input is performed here.
 */
int bbx_utf8_Nchars(const char *utf8)
{
  int count;

  for (count = 0; *utf8 != '\0'; count++)
    utf8 += bbx_utf8_skip(utf8);
  return count;
}
answered Feb 4 ’18 at 19:39
Malcolm McLeanMalcolm McLean
6,07611318
Your Answer
Thanks for contributing an answer to Stack Overflow!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Name
Email
Required, but never shown
By clicking “Post Your Answer”, you agree to our terms of service, privacy policy and cookie policy
Not the answer you’re looking for? Browse other questions tagged c++ c++11 unicode data-conversion utf or ask your own question.
请登录后查看评论内容