annotate foosdk/sdk/pfc/utf8.cpp @ 1:20d02a178406 default tip

*: check in everything else yay
author Paper <paper@tflc.us>
date Mon, 05 Jan 2026 02:15:46 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
1 #include "pfc-lite.h"
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
2 #include "string_base.h"
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
3
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
4 namespace pfc {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
5 //utf8 stuff
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
6 #include "pocket_char_ops.h"
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
7
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
8 #ifdef _MSC_VER
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
9 t_size utf16_decode_char(const wchar_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
10 PFC_STATIC_ASSERT( sizeof(wchar_t) == sizeof(char16_t) );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
11 return wide_decode_char( p_source, p_out, p_source_length );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
12 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
13 t_size utf16_encode_char(unsigned c,wchar_t * out) throw() {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
14 PFC_STATIC_ASSERT( sizeof(wchar_t) == sizeof(char16_t) );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
15 return wide_encode_char( c, out );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
16 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
17 #endif
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
18
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
19 t_size wide_decode_char(const wchar_t * p_source,unsigned * p_out,t_size p_source_length) throw() {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
20 PFC_STATIC_ASSERT( sizeof( wchar_t ) == sizeof( char16_t ) || sizeof( wchar_t ) == sizeof( unsigned ) );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
21 if constexpr (sizeof( wchar_t ) == sizeof( char16_t ) ) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
22 return utf16_decode_char( reinterpret_cast< const char16_t *>(p_source), p_out, p_source_length );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
23 } else {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
24 if (p_source_length == 0) { * p_out = 0; return 0; }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
25 * p_out = p_source [ 0 ];
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
26 return 1;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
27 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
28 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
29 t_size wide_encode_char(unsigned c,wchar_t * out) throw() {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
30 PFC_STATIC_ASSERT( sizeof( wchar_t ) == sizeof( char16_t ) || sizeof( wchar_t ) == sizeof( unsigned ) );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
31 if constexpr (sizeof( wchar_t ) == sizeof( char16_t ) ) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
32 return utf16_encode_char( c, reinterpret_cast< char16_t * >(out) );
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
33 } else {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
34 * out = (wchar_t) c;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
35 return 1;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
36 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
37 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
38
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
39 size_t uni_decode_char(const char16_t * p_source, unsigned & p_out, size_t p_source_length) noexcept {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
40 return utf16_decode_char(p_source, &p_out, p_source_length);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
41 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
42 size_t uni_decode_char(const char * p_source, unsigned & p_out, size_t p_source_length) noexcept {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
43 return utf8_decode_char(p_source, p_out, p_source_length);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
44 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
45 size_t uni_decode_char(const wchar_t * p_source, unsigned & p_out, size_t p_source_length) noexcept {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
46 if constexpr ( sizeof(wchar_t) == sizeof(char16_t)) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
47 return utf16_decode_char( reinterpret_cast<const char16_t*>(p_source), &p_out, p_source_length);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
48 } else {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
49 if (p_source_length > 0) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
50 unsigned c = (unsigned)*p_source;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
51 if (c != 0) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
52 p_out = c; return 1;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
53 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
54 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
55 p_out = 0; return 0;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
56 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
57 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
58
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
59 size_t uni_char_length(const char * arg) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
60 return utf8_char_len(arg);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
61 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
62 size_t uni_char_length(const char16_t * arg) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
63 unsigned dontcare;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
64 return utf16_decode_char(arg, &dontcare);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
65 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
66 size_t uni_char_length(const wchar_t * arg) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
67 if constexpr ( sizeof(wchar_t) == sizeof(char16_t) ) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
68 unsigned dontcare;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
69 return utf16_decode_char(reinterpret_cast<const char16_t*>(arg), &dontcare);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
70 } else {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
71 return *arg == 0 ? 0 : 1;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
72 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
73 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
74
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
75 size_t uni_encode_char(unsigned c, char* out) noexcept {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
76 PFC_ASSERT(c != 0);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
77 return utf8_encode_char(c, out);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
78 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
79 size_t uni_encode_char(unsigned c, char16_t* out) noexcept {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
80 PFC_ASSERT(c != 0);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
81 return utf16_encode_char(c, out);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
82 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
83 size_t uni_encode_char(unsigned c, wchar_t* out) noexcept {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
84 PFC_ASSERT(c != 0);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
85 if constexpr ( sizeof(wchar_t) == sizeof(char16_t)) {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
86 return utf16_encode_char(c, reinterpret_cast<char16_t*>(out));
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
87 } else {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
88 *out = (wchar_t)c; return 1;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
89 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
90 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
91
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
92
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
// Returns true when every byte of the NUL-terminated string param is below 0x80,
// i.e. the string is plain 7-bit ASCII ("lower" refers to the lower half of the
// byte range, not to letter case). An empty string yields true.
bool is_lower_ascii(const char * param)
{
	for (; *param != 0; ++param)
	{
		// Compare as unsigned char: the signedness of plain char is implementation-defined,
		// so a "< 0" test would never fire on targets where char is unsigned.
		if (static_cast<unsigned char>(*param) >= 0x80) return false;
	}
	return true;
}
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
102
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
// True when ptr points at the terminating NUL of a C string.
static bool check_end_of_string(const char * ptr)
{
	return *ptr == '\0';
}
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
107
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
108 size_t strcpy_utf8_truncate(const char * src,char * out,size_t maxbytes)
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
109 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
110 size_t rv = 0 , ptr = 0;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
111 if (maxbytes>0)
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
112 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
113 maxbytes--;//for null
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
114 while(!check_end_of_string(src) && maxbytes>0)
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
115 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
116 size_t delta = utf8_char_len(src);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
117 if (delta>maxbytes || delta==0) break;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
118 maxbytes -= delta;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
119 do
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
120 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
121 out[ptr++] = *(src++);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
122 } while(--delta);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
123 rv = ptr;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
124 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
125 out[rv]=0;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
126 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
127 return rv;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
128 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
129
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
130 t_size strlen_utf8(const char * p,t_size num) noexcept
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
131 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
132 unsigned w;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
133 t_size d;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
134 t_size ret = 0;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
135 for(;num;)
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
136 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
137 d = utf8_decode_char(p,w);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
138 if (w==0 || d<=0) break;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
139 ret++;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
140 p+=d;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
141 num-=d;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
142 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
143 return ret;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
144 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
145
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
146 t_size utf8_chars_to_bytes(const char * string,t_size count) noexcept
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
147 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
148 t_size bytes = 0;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
149 while(count)
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
150 {
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
151 unsigned dummy;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
152 t_size delta = utf8_decode_char(string+bytes,dummy);
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
153 if (delta==0) break;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
154 bytes += delta;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
155 count--;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
156 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
157 return bytes;
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
158 }
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
159
20d02a178406 *: check in everything else
Paper <paper@tflc.us>
parents:
diff changeset
160 }