Pyrogenesis  13997
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
utf8.cpp
Go to the documentation of this file.
1 /* Copyright (c) 2010 Wildfire Games
2  *
3  * Permission is hereby granted, free of charge, to any person obtaining
4  * a copy of this software and associated documentation files (the
5  * "Software"), to deal in the Software without restriction, including
6  * without limitation the rights to use, copy, modify, merge, publish,
7  * distribute, sublicense, and/or sell copies of the Software, and to
8  * permit persons to whom the Software is furnished to do so, subject to
9  * the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included
12  * in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
17  * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
18  * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
19  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
20  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21  */
22 
23 #include "precompiled.h"
24 #include "lib/utf8.h"
25 
27  { ERR::UTF8_SURROGATE, L"UTF-16 surrogate pairs aren't supported" },
28  { ERR::UTF8_OUTSIDE_BMP, L"Code point outside BMP (> 0x10000)" },
29  { ERR::UTF8_NONCHARACTER, L"Noncharacter (e.g. WEOF)" },
30  { ERR::UTF8_INVALID_UTF8, L"Invalid UTF-8 sequence" }
31 };
32 STATUS_ADD_DEFINITIONS(utf8StatusDefinitions);
33 
34 
35 // adapted from http://unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
36 // which bears the following notice:
37 /*
38 * Copyright 2001-2004 Unicode, Inc.
39 *
40 * Disclaimer
41 *
42 * This source code is provided as is by Unicode, Inc. No claims are
43 * made as to fitness for any particular purpose. No warranties of any
44 * kind are expressed or implied. The recipient agrees to determine
45 * applicability of information provided. If this file has been
46 * purchased on magnetic or optical media from Unicode, Inc., the
47 * sole remedy for any claim will be exchange of defective media
48 * within 90 days of receipt.
49 *
50 * Limitations on Rights to Redistribute This Code
51 *
52 * Unicode, Inc. hereby grants the right to freely use the information
53 * supplied in this file in the creation of products supporting the
54 * Unicode Standard, and to make copies of this file in any form
55 * for internal or external distribution as long as this notice
56 * remains attached.
57 */
58 
59 // design rationale:
60 // - to cope with wchar_t differences between VC (UTF-16) and
61 // GCC (UCS-4), we only allow codepoints in the BMP.
62 // encoded UTF-8 sequences are therefore no longer than 3 bytes.
63 // - surrogates are disabled because variable-length strings
64 // violate the purpose of using wchar_t instead of UTF-8.
65 // - replacing disallowed characters instead of aborting outright
66 // avoids overly inconveniencing users and eases debugging.
67 
68 // this implementation survives http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
69 
70 // (must be unsigned to avoid sign extension)
71 typedef u8 UTF8;
72 typedef u32 UTF32;
73 
74 
75 // called from ReplaceIfInvalid and UTF8Codec::Decode
76 static UTF32 RaiseError(Status err, Status* perr)
77 {
78  if(perr) // caller wants return code, not warning dialog
79  {
80  if(*perr == INFO::OK) // only return the first error (see header)
81  *perr = err;
82  }
83  else
84  DEBUG_WARN_ERR(err);
85 
86  return 0xFFFDul; // replacement character
87 }
88 
89 
91 {
92  // disallow surrogates
93  if(0xD800ul <= u && u <= 0xDFFFul)
94  return RaiseError(ERR::UTF8_SURROGATE, err);
95  // outside BMP (UTF-16 representation would require surrogates)
96  if(u > 0xFFFFul)
97  return RaiseError(ERR::UTF8_OUTSIDE_BMP, err);
98  // noncharacter (note: WEOF (0xFFFF) causes VC's swprintf to fail)
99  if(u == 0xFFFEul || u == 0xFFFFul || (0xFDD0ul <= u && u <= 0xFDEFul))
100  return RaiseError(ERR::UTF8_NONCHARACTER, err);
101  return u;
102 }
103 
104 
106 {
107 public:
108  static void Encode(UTF32 u, UTF8*& dstPos)
109  {
110  switch (Size(u))
111  {
112  case 1:
113  *dstPos++ = UTF8(u);
114  break;
115  case 2:
116  *dstPos++ = UTF8((u >> 6) | 0xC0);
117  *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
118  break;
119  case 3:
120  *dstPos++ = UTF8((u >> 12) | 0xE0);
121  *dstPos++ = UTF8(((u >> 6) | 0x80u) & 0xBFu);
122  *dstPos++ = UTF8((u | 0x80u) & 0xBFu);
123  break;
124  }
125  }
126 
127  // @return decoded scalar, or replacementCharacter on error
128  static UTF32 Decode(const UTF8*& srcPos, const UTF8* const srcEnd, Status* err)
129  {
130  const size_t size = SizeFromFirstByte(*srcPos);
131  if(!IsValid(srcPos, size, srcEnd))
132  {
133  srcPos += 1; // only skip the offending byte (increases chances of resynchronization)
134  return RaiseError(ERR::UTF8_INVALID_UTF8, err);
135  }
136 
137  UTF32 u = 0;
138  for(size_t i = 0; i < size-1; i++)
139  {
140  u += UTF32(*srcPos++);
141  u <<= 6;
142  }
143  u += UTF32(*srcPos++);
144 
145  static const UTF32 offsets[1+4] = { 0, 0x00000000ul, 0x00003080ul, 0x000E2080ul, 0x03C82080UL };
146  u -= offsets[size];
147  return u;
148  }
149 
150 private:
151  static inline size_t Size(UTF32 u)
152  {
153  if(u < 0x80)
154  return 1;
155  if(u < 0x800)
156  return 2;
157  // ReplaceIfInvalid ensures > 3 byte encodings are never used.
158  return 3;
159  }
160 
161  static inline size_t SizeFromFirstByte(UTF8 firstByte)
162  {
163  if(firstByte < 0xC0)
164  return 1;
165  if(firstByte < 0xE0)
166  return 2;
167  if(firstByte < 0xF0)
168  return 3;
169  // IsValid rejects firstByte values that would cause > 4 byte encodings.
170  return 4;
171  }
172 
173  // c.f. Unicode 3.1 Table 3-7
174  // @param size obtained via SizeFromFirstByte (our caller also uses it)
175  static bool IsValid(const UTF8* const src, size_t size, const UTF8* const srcEnd)
176  {
177  if(src+size > srcEnd) // not enough data
178  return false;
179 
180  if(src[0] < 0x80)
181  return true;
182  if(!(0xC2 <= src[0] && src[0] <= 0xF4))
183  return false;
184 
185  // special cases (stricter than the loop)
186  if(src[0] == 0xE0 && src[1] < 0xA0)
187  return false;
188  if(src[0] == 0xED && src[1] > 0x9F)
189  return false;
190  if(src[0] == 0xF0 && src[1] < 0x90)
191  return false;
192  if(src[0] == 0xF4 && src[1] > 0x8F)
193  return false;
194 
195  for(size_t i = 1; i < size; i++)
196  {
197  if(!(0x80 <= src[i] && src[i] <= 0xBF))
198  return false;
199  }
200 
201  return true;
202  }
203 };
204 
205 
206 //-----------------------------------------------------------------------------
207 
208 std::string utf8_from_wstring(const std::wstring& src, Status* err)
209 {
210  if(err)
211  *err = INFO::OK;
212 
213  std::string dst(src.size()*3+1, ' '); // see UTF8Codec::Size; +1 ensures &dst[0] is valid
214  UTF8* dstPos = (UTF8*)&dst[0];
215  for(size_t i = 0; i < src.size(); i++)
216  {
217  const UTF32 u = ReplaceIfInvalid(UTF32(src[i]), err);
218  UTF8Codec::Encode(u, dstPos);
219  }
220  dst.resize(dstPos - (UTF8*)&dst[0]);
221  return dst;
222 }
223 
224 
225 std::wstring wstring_from_utf8(const std::string& src, Status* err)
226 {
227  if(err)
228  *err = INFO::OK;
229 
230  std::wstring dst;
231  dst.reserve(src.size());
232  const UTF8* srcPos = (const UTF8*)src.data();
233  const UTF8* const srcEnd = srcPos + src.size();
234  while(srcPos < srcEnd)
235  {
236  const UTF32 u = UTF8Codec::Decode(srcPos, srcEnd, err);
237  dst.push_back((wchar_t)ReplaceIfInvalid(u, err));
238  }
239  return dst;
240 }
#define u8
Definition: types.h:39
const Status UTF8_INVALID_UTF8
Definition: utf8.h:32
const Status OK
Definition: status.h:386
std::string utf8_from_wstring(const std::wstring &src, Status *err)
opposite of wstring_from_utf8
Definition: utf8.cpp:208
static UTF32 Decode(const UTF8 *&srcPos, const UTF8 *const srcEnd, Status *err)
Definition: utf8.cpp:128
static size_t SizeFromFirstByte(UTF8 firstByte)
Definition: utf8.cpp:161
u8 UTF8
Definition: utf8.cpp:71
static UTF32 ReplaceIfInvalid(UTF32 u, Status *err)
Definition: utf8.cpp:90
static void Encode(UTF32 u, UTF8 *&dstPos)
Definition: utf8.cpp:108
u32 UTF32
Definition: utf8.cpp:72
i64 Status
Error handling system.
Definition: status.h:171
static bool IsValid(const UTF8 *const src, size_t size, const UTF8 *const srcEnd)
Definition: utf8.cpp:175
#define STATUS_ADD_DEFINITIONS(definitions)
add a module's array of StatusDefinition to the list.
Definition: status.h:216
#define DEBUG_WARN_ERR(status)
display the error dialog with text corresponding to the given error code.
Definition: debug.h:331
std::wstring wstring_from_utf8(const std::string &src, Status *err)
convert UTF-8 to a wide string (UTF-16 or UCS-4, depending on the platform's wchar_t).
Definition: utf8.cpp:225
#define u32
Definition: types.h:41
const Status UTF8_OUTSIDE_BMP
Definition: utf8.h:30
const Status UTF8_SURROGATE
Definition: utf8.h:29
static size_t Size(UTF32 u)
Definition: utf8.cpp:151
static const StatusDefinition utf8StatusDefinitions[]
Definition: utf8.cpp:26
static UTF32 RaiseError(Status err, Status *perr)
Definition: utf8.cpp:76
const Status UTF8_NONCHARACTER
Definition: utf8.h:31