libidn 1.41
nfkc.c
Go to the documentation of this file.
1/* nfkc.c --- Unicode normalization utilities.
2 Copyright (C) 2002-2022 Simon Josefsson
3
4 This file is part of GNU Libidn.
5
6 GNU Libidn is free software: you can redistribute it and/or
7 modify it under the terms of either:
8
9 * the GNU Lesser General Public License as published by the Free
10 Software Foundation; either version 3 of the License, or (at
11 your option) any later version.
12
13 or
14
15 * the GNU General Public License as published by the Free
16 Software Foundation; either version 2 of the License, or (at
17 your option) any later version.
18
19 or both in parallel, as here.
20
21 GNU Libidn is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
24 General Public License for more details.
25
26 You should have received copies of the GNU General Public License and
27 the GNU Lesser General Public License along with this program. If
28 not, see <https://www.gnu.org/licenses/>. */
29
30#ifdef HAVE_CONFIG_H
31# include "config.h"
32#endif
33
34#include <stdlib.h>
35#include <string.h>
36
37#include "stringprep.h"
38
39/* Hacks to make syncing with GLIB code easier. */
/* GLIB type and allocator names mapped onto their plain C equivalents. */
#define gboolean int
#define gchar char
#define guchar unsigned char
#define gint int
#define guint unsigned int
#define gushort unsigned short
#define gint16 int16_t
#define guint16 uint16_t
#define gunichar uint32_t
#define gsize size_t
#define gssize ssize_t
#define g_malloc malloc
#define g_free free
/* Return VAL from the enclosing function when EXPR is false.  The body
   is wrapped in do { } while (0) so that the macro followed by ';' is
   exactly one statement and stays safe in an unbraced if/else. */
#define g_return_val_if_fail(expr,val) \
  do \
    { \
      if (!(expr)) \
        return (val); \
    } \
  while (0)
57
58/* Code from GLIB gmacros.h starts here. */
59
60/* GLIB - Library of useful routines for C programming
61 * Copyright (C) 1995-1997 Peter Mattis, Spencer Kimball and Josh MacDonald
62 *
63 * This library is free software; you can redistribute it and/or
64 * modify it under the terms of the GNU Lesser General Public
65 * License as published by the Free Software Foundation; either
66 * version 2 of the License, or (at your option) any later version.
67 *
68 * This library is distributed in the hope that it will be useful,
69 * but WITHOUT ANY WARRANTY; without even the implied warranty of
70 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
71 * Lesser General Public License for more details.
72 *
73 * You should have received a copy of the GNU Lesser General Public
74 * License along with this library; if not, write to the
75 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
76 * Boston, MA 02111-1307, USA.
77 */
78
79#ifndef FALSE
80# define FALSE (0)
81#endif
82
83#ifndef TRUE
84# define TRUE (!FALSE)
85#endif
86
87#define G_N_ELEMENTS(arr) (sizeof (arr) / sizeof ((arr)[0]))
88
89#define G_UNLIKELY(expr) (expr)
90
91/* Code from GLIB gunicode.h starts here. */
92
93/* gunicode.h - Unicode manipulation functions
94 *
95 * Copyright (C) 1999, 2000 Tom Tromey
96 * Copyright 2000, 2005 Red Hat, Inc.
97 *
98 * The Gnome Library is free software; you can redistribute it and/or
99 * modify it under the terms of the GNU Lesser General Public License as
100 * published by the Free Software Foundation; either version 2 of the
101 * License, or (at your option) any later version.
102 *
103 * The Gnome Library is distributed in the hope that it will be useful,
104 * but WITHOUT ANY WARRANTY; without even the implied warranty of
105 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
106 * Lesser General Public License for more details.
107 *
108 * You should have received a copy of the GNU Lesser General Public
109 * License along with the Gnome Library; see the file COPYING.LIB. If not,
110 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
111 * Boston, MA 02111-1307, USA.
112 */
113
/* Normalization modes.  Each NF* name is an alias of the older GLIB
   name on the line above it: DEFAULT only standardizes differences
   that do not affect the text content, ALL also standardizes
   compatibility characters, and the *_COMPOSE variants return composed
   rather than maximally decomposed forms. */
typedef enum
{
  G_NORMALIZE_DEFAULT,
  G_NORMALIZE_NFD = G_NORMALIZE_DEFAULT,
  G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_NFC = G_NORMALIZE_DEFAULT_COMPOSE,
  G_NORMALIZE_ALL,
  G_NORMALIZE_NFKD = G_NORMALIZE_ALL,
  G_NORMALIZE_ALL_COMPOSE,
  G_NORMALIZE_NFKC = G_NORMALIZE_ALL_COMPOSE
}
GNormalizeMode;
126
127#define g_utf8_next_char(p) ((p) + g_utf8_skip[*(const guchar *)(p)])
128
129/* Code from GLIB gutf8.c starts here. */
130
131/* gutf8.c - Operations on UTF-8 strings.
132 *
133 * Copyright (C) 1999 Tom Tromey
134 * Copyright (C) 2000 Red Hat, Inc.
135 *
136 * This library is free software; you can redistribute it and/or
137 * modify it under the terms of the GNU Lesser General Public
138 * License as published by the Free Software Foundation; either
139 * version 2 of the License, or (at your option) any later version.
140 *
141 * This library is distributed in the hope that it will be useful,
142 * but WITHOUT ANY WARRANTY; without even the implied warranty of
143 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
144 * Lesser General Public License for more details.
145 *
146 * You should have received a copy of the GNU Lesser General Public
147 * License along with this library; if not, write to the
148 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
149 * Boston, MA 02111-1307, USA.
150 */
151
/* Classify Char, the lead byte of a UTF-8 sequence.  Len receives the
   total sequence length (1 to 6) and Mask the bit mask that selects the
   payload bits of the lead byte.  If Char cannot start a sequence, Len
   is set to -1 and Mask is left untouched.  Expands to one statement. */
#define UTF8_COMPUTE(Char, Mask, Len) \
  do \
    { \
      if ((Char) < 128) \
        { \
          (Len) = 1; \
          (Mask) = 0x7f; \
        } \
      else if (((Char) & 0xe0) == 0xc0) \
        { \
          (Len) = 2; \
          (Mask) = 0x1f; \
        } \
      else if (((Char) & 0xf0) == 0xe0) \
        { \
          (Len) = 3; \
          (Mask) = 0x0f; \
        } \
      else if (((Char) & 0xf8) == 0xf0) \
        { \
          (Len) = 4; \
          (Mask) = 0x07; \
        } \
      else if (((Char) & 0xfc) == 0xf8) \
        { \
          (Len) = 5; \
          (Mask) = 0x03; \
        } \
      else if (((Char) & 0xfe) == 0xfc) \
        { \
          (Len) = 6; \
          (Mask) = 0x01; \
        } \
      else \
        (Len) = -1; \
    } \
  while (0)
185
/* Number of bytes (1 to 6) needed to encode code point Char as UTF-8.
   The thresholds are tested from the longest form down. */
#define UTF8_LENGTH(Char) \
  ((Char) >= 0x4000000 ? 6 \
   : (Char) >= 0x200000 ? 5 \
   : (Char) >= 0x10000 ? 4 \
   : (Char) >= 0x800 ? 3 \
   : (Char) >= 0x80 ? 2 \
   : 1)
192
/* Decode the Len-byte UTF-8 sequence starting at Chars into Result.
   Mask selects the payload bits of the lead byte and Count is a scratch
   loop index.  Result is set to -1 as soon as a byte that is not of the
   form 10xxxxxx is found where a continuation byte is expected.
   Expands to one statement. */
#define UTF8_GET(Result, Chars, Count, Mask, Len) \
  do \
    { \
      (Result) = (Chars)[0] & (Mask); \
      for ((Count) = 1; (Count) < (Len); ++(Count)) \
        { \
          if (((Chars)[(Count)] & 0xc0) != 0x80) \
            { \
              (Result) = -1; \
              break; \
            } \
          (Result) <<= 6; \
          (Result) |= ((Chars)[(Count)] & 0x3f); \
        } \
    } \
  while (0)
205
206static const gchar utf8_skip_data[256] = {
207 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
208 1, 1, 1, 1, 1, 1, 1,
209 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
210 1, 1, 1, 1, 1, 1, 1,
211 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
212 1, 1, 1, 1, 1, 1, 1,
213 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
214 1, 1, 1, 1, 1, 1, 1,
215 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
216 1, 1, 1, 1, 1, 1, 1,
217 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
218 1, 1, 1, 1, 1, 1, 1,
219 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
220 2, 2, 2, 2, 2, 2, 2,
221 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,
222 5, 5, 5, 6, 6, 1, 1
223};
224
225static const gchar *const g_utf8_skip = utf8_skip_data;
226
227/*
228 * g_utf8_strlen:
229 * @p: pointer to the start of a UTF-8 encoded string
230 * @max: the maximum number of bytes to examine. If @max
231 * is less than 0, then the string is assumed to be
232 * nul-terminated. If @max is 0, @p will not be examined and
233 * may be %NULL.
234 *
235 * Computes the length of the string in characters, not including
236 * the terminating nul character.
237 *
238 * Return value: the length of the string in characters
239 **/
240static gsize
241g_utf8_strlen (const gchar * p)
242{
243 gsize len = 0;
244
245 g_return_val_if_fail (p != NULL, 0);
246
247 while (*p)
248 {
249 p = g_utf8_next_char (p);
250 ++len;
251 }
252
253 return len;
254}
255
256/*
257 * g_utf8_get_char:
258 * @p: a pointer to Unicode character encoded as UTF-8
259 *
260 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
261 * If @p does not point to a valid UTF-8 encoded character, results are
262 * undefined. If you are not sure that the bytes are complete
263 * valid Unicode characters, you should use g_utf8_get_char_validated()
264 * instead.
265 *
266 * Return value: the resulting character
267 **/
268static gunichar
269g_utf8_get_char (const gchar * p)
270{
271 int i, mask = 0, len;
272 gunichar result;
273 unsigned char c = (unsigned char) *p;
274
275 UTF8_COMPUTE (c, mask, len);
276 if (len == -1)
277 return (gunichar) - 1;
278 UTF8_GET (result, p, i, mask, len);
279
280 return result;
281}
282
283/*
284 * g_unichar_to_utf8:
285 * @c: a Unicode character code
286 * @outbuf: output buffer, must have at least 6 bytes of space.
287 * If %NULL, the length will be computed and returned
288 * and nothing will be written to @outbuf.
289 *
290 * Converts a single character to UTF-8.
291 *
292 * Return value: number of bytes written
293 **/
294static int
295g_unichar_to_utf8 (gunichar c, gchar * outbuf)
296{
297 /* If this gets modified, also update the copy in g_string_insert_unichar() */
298 guint len = 0;
299 int first;
300 int i;
301
302 if (c < 0x80)
303 {
304 first = 0;
305 len = 1;
306 }
307 else if (c < 0x800)
308 {
309 first = 0xc0;
310 len = 2;
311 }
312 else if (c < 0x10000)
313 {
314 first = 0xe0;
315 len = 3;
316 }
317 else if (c < 0x200000)
318 {
319 first = 0xf0;
320 len = 4;
321 }
322 else if (c < 0x4000000)
323 {
324 first = 0xf8;
325 len = 5;
326 }
327 else
328 {
329 first = 0xfc;
330 len = 6;
331 }
332
333 if (outbuf)
334 {
335 for (i = len - 1; i > 0; --i)
336 {
337 outbuf[i] = (c & 0x3f) | 0x80;
338 c >>= 6;
339 }
340 outbuf[0] = c | first;
341 }
342
343 return len;
344}
345
346/*
347 * g_utf8_to_ucs4_fast:
348 * @str: a UTF-8 encoded string
349 * @len: the maximum length of @str to use, in bytes. If @len < 0,
350 * then the string is nul-terminated.
351 * @items_written: location to store the number of characters in the
352 * result, or %NULL.
353 *
354 * Convert a string from UTF-8 to a 32-bit fixed width
355 * representation as UCS-4, assuming valid UTF-8 input.
356 * This function is roughly twice as fast as g_utf8_to_ucs4()
357 * but does no error checking on the input. A trailing 0 character
358 * will be added to the string after the converted text.
359 *
360 * Return value: a pointer to a newly allocated UCS-4 string.
361 * This value must be freed with g_free().
362 **/
363static gunichar *
364g_utf8_to_ucs4_fast (const gchar * str, gssize len, gsize * items_written)
365{
366 gunichar *result;
367 gsize n_chars, i;
368 const gchar *p;
369
370 g_return_val_if_fail (str != NULL, NULL);
371
372 p = str;
373 n_chars = 0;
374 if (len < 0)
375 {
376 while (*p)
377 {
378 p = g_utf8_next_char (p);
379 ++n_chars;
380 }
381 }
382 else
383 {
384 while (p < str + len && *p)
385 {
386 p = g_utf8_next_char (p);
387 ++n_chars;
388 }
389 }
390
391 result = g_malloc (sizeof (gunichar) * (n_chars + 1));
392 if (!result)
393 return NULL;
394
395 p = str;
396 for (i = 0; i < n_chars; i++)
397 {
398 gunichar wc = (guchar) * p++;
399
400 if (wc < 0x80)
401 {
402 result[i] = wc;
403 }
404 else
405 {
406 gunichar mask = 0x40;
407
408 if (G_UNLIKELY ((wc & mask) == 0))
409 {
410 /* It's an out-of-sequence 10xxxxxxx byte.
411 * Rather than making an ugly hash of this and the next byte
412 * and overrunning the buffer, it's more useful to treat it
413 * with a replacement character */
414 result[i] = 0xfffd;
415 continue;
416 }
417
418 do
419 {
420 wc <<= 6;
421 wc |= (guchar) (*p++) & 0x3f;
422 mask <<= 5;
423 }
424 while ((wc & mask) != 0);
425
426 wc &= mask - 1;
427
428 result[i] = wc;
429 }
430 }
431 result[i] = 0;
432
433 if (items_written)
434 *items_written = i;
435
436 return result;
437}
438
439/*
440 * g_ucs4_to_utf8:
441 * @str: a UCS-4 encoded string
442 * @len: the maximum length (number of characters) of @str to use.
443 * If @len < 0, then the string is nul-terminated.
444 * @items_read: location to store number of characters read, or %NULL.
445 * @items_written: location to store number of bytes written or %NULL.
446 * The value here stored does not include the trailing 0
447 * byte.
448 * @error: location to store the error occurring, or %NULL to ignore
449 * errors. Any of the errors in #GConvertError other than
450 * %G_CONVERT_ERROR_NO_CONVERSION may occur.
451 *
452 * Convert a string from a 32-bit fixed width representation as UCS-4.
453 * to UTF-8. The result will be terminated with a 0 byte.
454 *
455 * Return value: a pointer to a newly allocated UTF-8 string.
456 * This value must be freed with g_free(). If an
457 * error occurs, %NULL will be returned and
458 * @error set. In that case, @items_read will be
459 * set to the position of the first invalid input
460 * character.
461 **/
462static gchar *
463g_ucs4_to_utf8 (const gunichar * str,
464 gsize len, gsize * items_read, gsize * items_written)
465{
466 gint result_length;
467 gchar *result = NULL;
468 gchar *p;
469 gsize i;
470
471 result_length = 0;
472 for (i = 0; len < 0 || i < len; i++)
473 {
474 if (!str[i])
475 break;
476
477 if (str[i] >= 0x80000000)
478 goto err_out;
479
480 result_length += UTF8_LENGTH (str[i]);
481 }
482
483 result = g_malloc (result_length + 1);
484 if (!result)
485 return NULL;
486 p = result;
487
488 i = 0;
489 while (p < result + result_length)
490 p += g_unichar_to_utf8 (str[i++], p);
491
492 *p = '\0';
493
494 if (items_written)
495 *items_written = p - result;
496
497err_out:
498 if (items_read)
499 *items_read = i;
500
501 return result;
502}
503
504/* Code from GLIB gunidecomp.c starts here. */
505
506/* decomp.c - Character decomposition.
507 *
508 * Copyright (C) 1999, 2000 Tom Tromey
509 * Copyright 2000 Red Hat, Inc.
510 *
511 * The Gnome Library is free software; you can redistribute it and/or
512 * modify it under the terms of the GNU Lesser General Public License as
513 * published by the Free Software Foundation; either version 2 of the
514 * License, or (at your option) any later version.
515 *
516 * The Gnome Library is distributed in the hope that it will be useful,
517 * but WITHOUT ANY WARRANTY; without even the implied warranty of
518 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
519 * Lesser General Public License for more details.
520 *
521 * You should have received a copy of the GNU Lesser General Public
522 * License along with the Gnome Library; see the file COPYING.LIB. If not,
523 * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
524 * Boston, MA 02111-1307, USA.
525 */
526
527#include "gunidecomp.h"
528#include "gunicomp.h"
529
/* Shared lookup for both halves of the combining class table.  An
   entry of Table that is at least G_UNICODE_MAX_TABLE_INDEX encodes
   one class value for the whole page; any other entry selects a row
   of cclass_data that is then indexed by Char. */
#define CC_LOOKUP(Table, Page, Char) \
  (((Table)[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? ((Table)[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (cclass_data[(Table)[Page]][Char]))

#define CC_PART1(Page, Char) \
  CC_LOOKUP (combining_class_table_part1, Page, Char)

#define CC_PART2(Page, Char) \
  CC_LOOKUP (combining_class_table_part2, Page, Char)

/* Combining class of Char: part 1 covers code points up to
   G_UNICODE_LAST_CHAR_PART1, part 2 covers 0xe0000 up to
   G_UNICODE_LAST_CHAR, and everything else is class 0. */
#define COMBINING_CLASS(Char) \
  (((Char) <= G_UNICODE_LAST_CHAR_PART1) \
   ? CC_PART1 ((Char) >> 8, (Char) & 0xff) \
   : (((Char) >= 0xe0000 && (Char) <= G_UNICODE_LAST_CHAR) \
      ? CC_PART2 (((Char) - 0xe0000) >> 8, (Char) & 0xff) \
      : 0))
546
547/* constants for hangul syllable [de]composition */
548#define SBase 0xAC00
549#define LBase 0x1100
550#define VBase 0x1161
551#define TBase 0x11A7
552#define LCount 19
553#define VCount 21
554#define TCount 28
555#define NCount (VCount * TCount)
556#define SCount (LCount * NCount)
557
558/*
559 * g_unicode_canonical_ordering:
560 * @string: a UCS-4 encoded string.
561 * @len: the maximum length of @string to use.
562 *
563 * Computes the canonical ordering of a string in-place.
564 * This rearranges decomposed characters in the string
565 * according to their combining classes. See the Unicode
566 * manual for more information.
567 **/
568static void
569g_unicode_canonical_ordering (gunichar * string, gsize len)
570{
571 gsize i;
572 int swap = 1;
573
574 while (swap)
575 {
576 int last;
577 swap = 0;
578 last = COMBINING_CLASS (string[0]);
579 for (i = 0; i < len - 1; ++i)
580 {
581 int next = COMBINING_CLASS (string[i + 1]);
582 if (next != 0 && last > next)
583 {
584 gsize j;
585 /* Percolate item leftward through string. */
586 for (j = i + 1; j > 0; --j)
587 {
588 gunichar t;
589 if (COMBINING_CLASS (string[j - 1]) <= next)
590 break;
591 t = string[j];
592 string[j] = string[j - 1];
593 string[j - 1] = t;
594 swap = 1;
595 }
596 /* We're re-entering the loop looking at the old
597 character again. */
598 next = last;
599 }
600 last = next;
601 }
602 }
603}
604
605/* http://www.unicode.org/unicode/reports/tr15/#Hangul
606 * r should be null or have sufficient space. Calling with r == NULL will
607 * only calculate the result_len; however, a buffer with space for three
608 * characters will always be big enough. */
609static void
610decompose_hangul (gunichar s, gunichar * r, gsize * result_len)
611{
612 gint SIndex = s - SBase;
613 gint TIndex = SIndex % TCount;
614
615 if (r)
616 {
617 r[0] = LBase + SIndex / NCount;
618 r[1] = VBase + (SIndex % NCount) / TCount;
619 }
620
621 if (TIndex)
622 {
623 if (r)
624 r[2] = TBase + TIndex;
625 *result_len = 3;
626 }
627 else
628 *result_len = 2;
629}
630
631/* returns a pointer to a null-terminated UTF-8 string */
632static const gchar *
633find_decomposition (gunichar ch, gboolean compat)
634{
635 int start = 0;
636 int end = G_N_ELEMENTS (decomp_table);
637
638 if (ch >= decomp_table[start].ch && ch <= decomp_table[end - 1].ch)
639 {
640 while (TRUE)
641 {
642 int half = (start + end) / 2;
643 if (ch == decomp_table[half].ch)
644 {
645 int offset;
646
647 if (compat)
648 {
649 offset = decomp_table[half].compat_offset;
650 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
651 offset = decomp_table[half].canon_offset;
652 }
653 else
654 {
655 offset = decomp_table[half].canon_offset;
656 if (offset == G_UNICODE_NOT_PRESENT_OFFSET)
657 return NULL;
658 }
659
660 return &(decomp_expansion_string[offset]);
661 }
662 else if (half == start)
663 break;
664 else if (ch > decomp_table[half].ch)
665 start = half;
666 else
667 end = half;
668 }
669 }
670
671 return NULL;
672}
673
674/* L,V => LV and LV,T => LVT */
675static gboolean
676combine_hangul (gunichar a, gunichar b, gunichar * result)
677{
678 if (a >= LBase && a < LCount + LBase && b >= VBase && b < VCount + VBase)
679 {
680 gint LIndex = a - LBase;
681 gint VIndex = b - VBase;
682
683 *result = SBase + (LIndex * VCount + VIndex) * TCount;
684 return TRUE;
685 }
686
687 if (a >= SBase && a < SCount + SBase && b > TBase && b < TCount + TBase)
688 {
689 gint SIndex = a - SBase;
690
691 if ((SIndex % TCount) == 0)
692 {
693 gint TIndex = b - TBase;
694
695 *result = a + TIndex;
696 return TRUE;
697 }
698 }
699
700 return FALSE;
701}
702
/* Composition index for one table page.  An entry of compose_table
   that is at least G_UNICODE_MAX_TABLE_INDEX encodes one value for the
   whole page; any other entry selects a row of compose_data that is
   then indexed by Char. */
#define CI(Page, Char) \
  ((compose_table[Page] >= G_UNICODE_MAX_TABLE_INDEX) \
   ? (compose_table[Page] - G_UNICODE_MAX_TABLE_INDEX) \
   : (compose_data[compose_table[Page]][Char]))

/* Composition index of code point Char; 0 for code points whose page
   lies beyond COMPOSE_TABLE_LAST.  Char is parenthesised at every use
   so that an expression argument expands correctly. */
#define COMPOSE_INDEX(Char) \
  ((((Char) >> 8) > (COMPOSE_TABLE_LAST)) \
   ? 0 : CI ((Char) >> 8, (Char) & 0xff))
710
711static gboolean
712combine (gunichar a, gunichar b, gunichar * result)
713{
714 gushort index_a, index_b;
715
716 if (combine_hangul (a, b, result))
717 return TRUE;
718
719 index_a = COMPOSE_INDEX (a);
720
721 if (index_a >= COMPOSE_FIRST_SINGLE_START && index_a < COMPOSE_SECOND_START)
722 {
723 if (b == compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][0])
724 {
725 *result =
726 compose_first_single[index_a - COMPOSE_FIRST_SINGLE_START][1];
727 return TRUE;
728 }
729 else
730 return FALSE;
731 }
732
733 index_b = COMPOSE_INDEX (b);
734
735 if (index_b >= COMPOSE_SECOND_SINGLE_START)
736 {
737 if (a ==
738 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][0])
739 {
740 *result =
741 compose_second_single[index_b - COMPOSE_SECOND_SINGLE_START][1];
742 return TRUE;
743 }
744 else
745 return FALSE;
746 }
747
748 if (index_a >= COMPOSE_FIRST_START && index_a < COMPOSE_FIRST_SINGLE_START
749 && index_b >= COMPOSE_SECOND_START
750 && index_b < COMPOSE_SECOND_SINGLE_START)
751 {
752 gunichar res =
753 compose_array[index_a - COMPOSE_FIRST_START][index_b -
755
756 if (res)
757 {
758 *result = res;
759 return TRUE;
760 }
761 }
762
763 return FALSE;
764}
765
766static gunichar *
767_g_utf8_normalize_wc (const gchar * str, gssize max_len, GNormalizeMode mode)
768{
769 gsize n_wc;
770 gunichar *wc_buffer;
771 const char *p;
772 gsize last_start;
773 gboolean do_compat = (mode == G_NORMALIZE_NFKC || mode == G_NORMALIZE_NFKD);
774 gboolean do_compose = (mode == G_NORMALIZE_NFC || mode == G_NORMALIZE_NFKC);
775
776 n_wc = 0;
777 p = str;
778 while ((max_len < 0 || p < str + max_len) && *p)
779 {
780 const gchar *decomp;
781 gunichar wc = g_utf8_get_char (p);
782
783 if (wc >= SBase && wc < SBase + SCount)
784 {
785 gsize result_len;
786 decompose_hangul (wc, NULL, &result_len);
787 n_wc += result_len;
788 }
789 else
790 {
791 decomp = find_decomposition (wc, do_compat);
792
793 if (decomp)
794 n_wc += g_utf8_strlen (decomp);
795 else
796 n_wc++;
797 }
798
799 p = g_utf8_next_char (p);
800 }
801
802 wc_buffer = g_malloc (sizeof (gunichar) * (n_wc + 1));
803 if (!wc_buffer)
804 return NULL;
805
806 last_start = 0;
807 n_wc = 0;
808 p = str;
809 while ((max_len < 0 || p < str + max_len) && *p)
810 {
811 gunichar wc = g_utf8_get_char (p);
812 const gchar *decomp;
813 int cc;
814 gsize old_n_wc = n_wc;
815
816 if (wc >= SBase && wc < SBase + SCount)
817 {
818 gsize result_len;
819 decompose_hangul (wc, wc_buffer + n_wc, &result_len);
820 n_wc += result_len;
821 }
822 else
823 {
824 decomp = find_decomposition (wc, do_compat);
825
826 if (decomp)
827 {
828 const char *pd;
829 for (pd = decomp; *pd != '\0'; pd = g_utf8_next_char (pd))
830 wc_buffer[n_wc++] = g_utf8_get_char (pd);
831 }
832 else
833 wc_buffer[n_wc++] = wc;
834 }
835
836 if (n_wc > 0)
837 {
838 cc = COMBINING_CLASS (wc_buffer[old_n_wc]);
839
840 if (cc == 0)
841 {
842 g_unicode_canonical_ordering (wc_buffer + last_start,
843 n_wc - last_start);
844 last_start = old_n_wc;
845 }
846 }
847
848 p = g_utf8_next_char (p);
849 }
850
851 if (n_wc > 0)
852 {
853 g_unicode_canonical_ordering (wc_buffer + last_start,
854 n_wc - last_start);
855 /* dead assignment: last_start = n_wc; */
856 }
857
858 wc_buffer[n_wc] = 0;
859
860 /* All decomposed and reordered */
861
862 if (do_compose && n_wc > 0)
863 {
864 gsize i, j;
865 int last_cc = 0;
866 last_start = 0;
867
868 for (i = 0; i < n_wc; i++)
869 {
870 int cc = COMBINING_CLASS (wc_buffer[i]);
871
872 if (i > 0 &&
873 (last_cc == 0 || last_cc != cc) &&
874 combine (wc_buffer[last_start], wc_buffer[i],
875 &wc_buffer[last_start]))
876 {
877 for (j = i + 1; j < n_wc; j++)
878 wc_buffer[j - 1] = wc_buffer[j];
879 n_wc--;
880 i--;
881
882 if (i == last_start)
883 last_cc = 0;
884 else
885 last_cc = COMBINING_CLASS (wc_buffer[i - 1]);
886
887 continue;
888 }
889
890 if (cc == 0)
891 last_start = i;
892
893 last_cc = cc;
894 }
895 }
896
897 wc_buffer[n_wc] = 0;
898
899 return wc_buffer;
900}
901
902/*
903 * g_utf8_normalize:
904 * @str: a UTF-8 encoded string.
905 * @len: length of @str, in bytes, or -1 if @str is nul-terminated.
906 * @mode: the type of normalization to perform.
907 *
908 * Converts a string into canonical form, standardizing
909 * such issues as whether a character with an accent
910 * is represented as a base character and combining
911 * accent or as a single precomposed character. The
912 * string has to be valid UTF-8, otherwise %NULL is
913 * returned. You should generally call g_utf8_normalize()
914 * before comparing two Unicode strings.
915 *
916 * The normalization mode %G_NORMALIZE_DEFAULT only
917 * standardizes differences that do not affect the
918 * text content, such as the above-mentioned accent
919 * representation. %G_NORMALIZE_ALL also standardizes
920 * the "compatibility" characters in Unicode, such
921 * as SUPERSCRIPT THREE to the standard forms
922 * (in this case DIGIT THREE). Formatting information
923 * may be lost but for most text operations such
924 * characters should be considered the same.
925 *
926 * %G_NORMALIZE_DEFAULT_COMPOSE and %G_NORMALIZE_ALL_COMPOSE
927 * are like %G_NORMALIZE_DEFAULT and %G_NORMALIZE_ALL,
928 * but returned a result with composed forms rather
929 * than a maximally decomposed form. This is often
930 * useful if you intend to convert the string to
931 * a legacy encoding or pass it to a system with
932 * less capable Unicode handling.
933 *
934 * Return value: a newly allocated string, that is the
935 * normalized form of @str, or %NULL if @str is not
936 * valid UTF-8.
937 **/
938static gchar *
939g_utf8_normalize (const gchar * str, gssize len, GNormalizeMode mode)
940{
941 gunichar *result_wc = _g_utf8_normalize_wc (str, len, mode);
942 gchar *result = NULL;
943
944 if (result_wc)
945 result = g_ucs4_to_utf8 (result_wc, -1, NULL, NULL);
946
947 g_free (result_wc);
948
949 return result;
950}
951
952/* Public Libidn API starts here. */
953
/**
 * stringprep_utf8_to_unichar:
 * @p: a pointer to Unicode character encoded as UTF-8
 *
 * Converts a sequence of bytes encoded as UTF-8 to a Unicode character
 * by forwarding to g_utf8_get_char().  If @p does not point to a valid
 * UTF-8 encoded character, the result should not be relied on.
 *
 * Return value: the resulting character.
 **/
uint32_t
stringprep_utf8_to_unichar (const char *p)
{
  return g_utf8_get_char (p);
}
969
/**
 * stringprep_unichar_to_utf8:
 * @c: a Unicode character code
 * @outbuf: output buffer with room for at least 6 bytes, or %NULL to
 *   only compute the encoded length.
 *
 * Converts a single character to UTF-8 by forwarding to
 * g_unichar_to_utf8().
 *
 * Return value: number of bytes the encoding occupies.
 **/
int
stringprep_unichar_to_utf8 (uint32_t c, char *outbuf)
{
  const int n_bytes = g_unichar_to_utf8 (c, outbuf);

  return n_bytes;
}
986
987#include <unistr.h>
988
1005uint32_t *
1006stringprep_utf8_to_ucs4 (const char *str, ssize_t len, size_t *items_written)
1007{
1008 size_t n;
1009
1010 if (len < 0)
1011 n = strlen (str);
1012 else
1013 n = len;
1014
1015 if (u8_check ((const uint8_t *) str, n))
1016 return NULL;
1017
1018 return g_utf8_to_ucs4_fast (str, len, items_written);
1019}
1020
1038char *
1039stringprep_ucs4_to_utf8 (const uint32_t * str, ssize_t len,
1040 size_t *items_read, size_t *items_written)
1041{
1042 return g_ucs4_to_utf8 (str, len, items_read, items_written);
1043}
1044
1067char *
1068stringprep_utf8_nfkc_normalize (const char *str, ssize_t len)
1069{
1070 size_t n;
1071
1072 if (len < 0)
1073 n = strlen (str);
1074 else
1075 n = len;
1076
1077 if (u8_check ((const uint8_t *) str, n))
1078 return NULL;
1079
1080 return g_utf8_normalize (str, len, G_NORMALIZE_NFKC);
1081}
1082
1083#include <stdio.h>
1095uint32_t *
1096stringprep_ucs4_nfkc_normalize (const uint32_t * str, ssize_t len)
1097{
1098 char *p;
1099 uint32_t *result_wc;
1100
1101 p = stringprep_ucs4_to_utf8 (str, len, 0, 0);
1102 if (!p)
1103 return NULL;
1104
1105 result_wc = _g_utf8_normalize_wc (p, -1, G_NORMALIZE_NFKC);
1106 free (p);
1107
1108 return result_wc;
1109}
#define COMPOSE_SECOND_SINGLE_START
Definition: gunicomp.h:8
#define COMPOSE_SECOND_START
Definition: gunicomp.h:7
#define COMPOSE_FIRST_START
Definition: gunicomp.h:5
#define COMPOSE_FIRST_SINGLE_START
Definition: gunicomp.h:6
#define G_UNICODE_NOT_PRESENT_OFFSET
Definition: gunidecomp.h:16
#define g_return_val_if_fail(expr, val)
Definition: nfkc.c:53
#define SCount
Definition: nfkc.c:556
#define gssize
Definition: nfkc.c:50
#define gushort
Definition: nfkc.c:45
char * stringprep_ucs4_to_utf8(const uint32_t *str, ssize_t len, size_t *items_read, size_t *items_written)
Definition: nfkc.c:1039
#define UTF8_COMPUTE(Char, Mask, Len)
Definition: nfkc.c:152
int stringprep_unichar_to_utf8(uint32_t c, char *outbuf)
Definition: nfkc.c:982
#define gunichar
Definition: nfkc.c:48
#define COMPOSE_INDEX(Char)
Definition: nfkc.c:708
uint32_t * stringprep_ucs4_nfkc_normalize(const uint32_t *str, ssize_t len)
Definition: nfkc.c:1096
#define guint
Definition: nfkc.c:44
#define g_free
Definition: nfkc.c:52
#define G_N_ELEMENTS(arr)
Definition: nfkc.c:87
#define gchar
Definition: nfkc.c:41
#define LBase
Definition: nfkc.c:549
#define gint
Definition: nfkc.c:43
#define UTF8_LENGTH(Char)
Definition: nfkc.c:186
char * stringprep_utf8_nfkc_normalize(const char *str, ssize_t len)
Definition: nfkc.c:1068
#define g_utf8_next_char(p)
Definition: nfkc.c:127
#define TRUE
Definition: nfkc.c:84
#define FALSE
Definition: nfkc.c:80
#define G_UNLIKELY(expr)
Definition: nfkc.c:89
#define TBase
Definition: nfkc.c:551
#define UTF8_GET(Result, Chars, Count, Mask, Len)
Definition: nfkc.c:193
#define VBase
Definition: nfkc.c:550
uint32_t stringprep_utf8_to_unichar(const char *p)
Definition: nfkc.c:965
#define COMBINING_CLASS(Char)
Definition: nfkc.c:540
#define NCount
Definition: nfkc.c:555
#define guchar
Definition: nfkc.c:42
#define g_malloc
Definition: nfkc.c:51
GNormalizeMode
Definition: nfkc.c:115
@ G_NORMALIZE_DEFAULT_COMPOSE
Definition: nfkc.c:118
@ G_NORMALIZE_NFKC
Definition: nfkc.c:123
@ G_NORMALIZE_NFKD
Definition: nfkc.c:121
@ G_NORMALIZE_ALL
Definition: nfkc.c:120
@ G_NORMALIZE_NFD
Definition: nfkc.c:117
@ G_NORMALIZE_DEFAULT
Definition: nfkc.c:116
@ G_NORMALIZE_ALL_COMPOSE
Definition: nfkc.c:122
@ G_NORMALIZE_NFC
Definition: nfkc.c:119
uint32_t * stringprep_utf8_to_ucs4(const char *str, ssize_t len, size_t *items_written)
Definition: nfkc.c:1006
#define SBase
Definition: nfkc.c:548
#define TCount
Definition: nfkc.c:554
#define gsize
Definition: nfkc.c:49
#define VCount
Definition: nfkc.c:553
#define gboolean
Definition: nfkc.c:40