json: Reject invalid UTF-8 sequences

qemu with hax to log dma reads & writes jcs.org/2018/11/12/vfio

We reject bytes that can't occur in valid UTF-8 (\xC0..\xC1,
\xF5..\xFF) in the lexer. That's insufficient; there's plenty of
invalid UTF-8 not containing these bytes, as demonstrated by
check-qjson:

* Malformed sequences

- Unexpected continuation bytes

- Missing continuation bytes after start bytes other than
\xC0..\xC1, \xF5..\xFD.

* Overlong sequences with start bytes other than \xC0..\xC1,
\xF5..\xFD.

* Invalid code points

Fixing this in the lexer would be bothersome. Fixing it in the parser
is straightforward, so do that.

Signed-off-by: Markus Armbruster <armbru@redhat.com>
Reviewed-by: Eric Blake <eblake@redhat.com>
Message-Id: <20180823164025.12553-23-armbru@redhat.com>

Markus Armbruster 7 years ago e59f39d4 a89d3104

+122 -105

4 changed files

expand all

include

qemu

unicode.h

qobject

json-parser.c

tests

check-qjson.c

util

unicode.c

include/qemu/unicode.h

··· 2 2 #define QEMU_UNICODE_H 3 3 4 4 int mod_utf8_codepoint(const char *s, size_t n, char **end); 5 + ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint); 5 6 6 7 #endif

+14 -6

qobject/json-parser.c

··· 13 13 14 14 #include "qemu/osdep.h" 15 15 #include "qemu/cutils.h" 16 + #include "qemu/unicode.h" 16 17 #include "qapi/error.h" 17 18 #include "qemu-common.h" 18 19 #include "qapi/qmp/qbool.h" ··· 133 134 const char *ptr = token->str; 134 135 QString *str; 135 136 char quote; 137 + int cp; 138 + char *end; 139 + ssize_t len; 140 + char utf8_buf[5]; 136 141 137 142 assert(*ptr == '"' || *ptr == '\''); 138 143 quote = *ptr++; ··· 194 199 goto out; 195 200 } 196 201 } else { 197 - char dummy[2]; 198 - 199 - dummy[0] = *ptr++; 200 - dummy[1] = 0; 201 - 202 - qstring_append(str, dummy); 202 + cp = mod_utf8_codepoint(ptr, 6, &end); 203 + if (cp <= 0) { 204 + parse_error(ctxt, token, "invalid UTF-8 sequence in string"); 205 + goto out; 206 + } 207 + ptr = end; 208 + len = mod_utf8_encode(utf8_buf, sizeof(utf8_buf), cp); 209 + assert(len >= 0); 210 + qstring_append(str, utf8_buf); 203 211 } 204 212 } 205 213

+45 -92

tests/check-qjson.c

··· 152 152 static void utf8_string(void) 153 153 { 154 154 /* 155 - * FIXME Current behavior for invalid UTF-8 sequences is 156 - * incorrect. This test expects current, incorrect results. 157 - * They're all marked "bug:" below, and are to be replaced by 158 - * correct ones as the bugs get fixed. 159 - * 160 - * The JSON parser rejects some, but not all invalid sequences. 161 - * 162 155 * Problem: we can't easily deal with embedded U+0000. Parsing 163 156 * the JSON string "this \\u0000" is fun" yields "this \0 is fun", 164 157 * which gets misinterpreted as NUL-terminated "this ". We should ··· 177 170 /* Expected unparse output, defaults to @json_in */ 178 171 const char *json_out; 179 172 } test_cases[] = { 180 - /* 181 - * Bug markers used here: 182 - * - bug: not rejected 183 - * JSON parser fails to reject invalid sequence(s) 184 - */ 185 - 186 173 /* 0 Control characters */ 187 174 { 188 175 /* ··· 330 317 { 331 318 /* first one beyond Unicode range: U+110000 */ 332 319 "\xF4\x90\x80\x80", 333 - "\xF4\x90\x80\x80", 320 + NULL, 334 321 "\\uFFFD", 335 322 }, 336 323 /* 3 Malformed sequences */ ··· 338 325 /* 3.1.1 First continuation byte */ 339 326 { 340 327 "\x80", 341 - "\x80", /* bug: not rejected */ 328 + NULL, 342 329 "\\uFFFD", 343 330 }, 344 331 /* 3.1.2 Last continuation byte */ 345 332 { 346 333 "\xBF", 347 - "\xBF", /* bug: not rejected */ 334 + NULL, 348 335 "\\uFFFD", 349 336 }, 350 337 /* 3.1.3 2 continuation bytes */ 351 338 { 352 339 "\x80\xBF", 353 - "\x80\xBF", /* bug: not rejected */ 340 + NULL, 354 341 "\\uFFFD\\uFFFD", 355 342 }, 356 343 /* 3.1.4 3 continuation bytes */ 357 344 { 358 345 "\x80\xBF\x80", 359 - "\x80\xBF\x80", /* bug: not rejected */ 346 + NULL, 360 347 "\\uFFFD\\uFFFD\\uFFFD", 361 348 }, 362 349 /* 3.1.5 4 continuation bytes */ 363 350 { 364 351 "\x80\xBF\x80\xBF", 365 - "\x80\xBF\x80\xBF", /* bug: not rejected */ 352 + NULL, 366 353 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD", 367 354 }, 368 355 /* 3.1.6 5 continuation bytes */ 369 
356 { 370 357 "\x80\xBF\x80\xBF\x80", 371 - "\x80\xBF\x80\xBF\x80", /* bug: not rejected */ 358 + NULL, 372 359 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD", 373 360 }, 374 361 /* 3.1.7 6 continuation bytes */ 375 362 { 376 363 "\x80\xBF\x80\xBF\x80\xBF", 377 - "\x80\xBF\x80\xBF\x80\xBF", /* bug: not rejected */ 364 + NULL, 378 365 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD", 379 366 }, 380 367 /* 3.1.8 7 continuation bytes */ 381 368 { 382 369 "\x80\xBF\x80\xBF\x80\xBF\x80", 383 - "\x80\xBF\x80\xBF\x80\xBF\x80", /* bug: not rejected */ 370 + NULL, 384 371 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD", 385 372 }, 386 373 /* 3.1.9 Sequence of all 64 possible continuation bytes */ ··· 393 380 "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" 394 381 "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7" 395 382 "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF", 396 - /* bug: not rejected */ 397 - "\x80\x81\x82\x83\x84\x85\x86\x87" 398 - "\x88\x89\x8A\x8B\x8C\x8D\x8E\x8F" 399 - "\x90\x91\x92\x93\x94\x95\x96\x97" 400 - "\x98\x99\x9A\x9B\x9C\x9D\x9E\x9F" 401 - "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7" 402 - "\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF" 403 - "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7" 404 - "\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF", 405 - "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 383 + NULL, 406 384 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 407 385 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 408 386 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" ··· 410 388 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 411 389 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 412 390 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 391 + "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD", 413 392 }, 414 393 /* 3.2 Lonely start characters */ 415 394 /* 3.2.1 All 32 first bytes of 2-byte sequences, followed by space */ ··· 418 397 "\xC8 \xC9 \xCA \xCB \xCC \xCD \xCE \xCF " 419 398 "\xD0 \xD1 \xD2 \xD3 \xD4 \xD5 \xD6 \xD7 " 420 399 "\xD8 \xD9 \xDA \xDB 
\xDC \xDD \xDE \xDF ", 421 - NULL, /* bug: accepted partly, see FIXME below */ 400 + NULL, 422 401 "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD " 423 402 "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD " 424 403 "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD " ··· 428 407 { 429 408 "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 " 430 409 "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ", 431 - /* bug: not rejected */ 432 - "\xE0 \xE1 \xE2 \xE3 \xE4 \xE5 \xE6 \xE7 " 433 - "\xE8 \xE9 \xEA \xEB \xEC \xED \xEE \xEF ", 410 + NULL, 434 411 "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD " 435 412 "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD ", 436 413 }, 437 414 /* 3.2.3 All 8 first bytes of 4-byte sequences, followed by space */ 438 415 { 439 416 "\xF0 \xF1 \xF2 \xF3 \xF4 \xF5 \xF6 \xF7 ", 440 - NULL, /* bug: accepted partly, see FIXME below */ 417 + NULL, 441 418 "\\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD \\uFFFD ", 442 419 }, 443 420 /* 3.2.4 All 4 first bytes of 5-byte sequences, followed by space */ ··· 462 439 /* 3.3.2 3-byte sequence with last byte missing (U+0000) */ 463 440 { 464 441 "\xE0\x80", 465 - "\xE0\x80", /* bug: not rejected */ 442 + NULL, 466 443 "\\uFFFD", 467 444 }, 468 445 /* 3.3.3 4-byte sequence with last byte missing (U+0000) */ 469 446 { 470 447 "\xF0\x80\x80", 471 - "\xF0\x80\x80", /* bug: not rejected */ 448 + NULL, 472 449 "\\uFFFD", 473 450 }, 474 451 /* 3.3.4 5-byte sequence with last byte missing (U+0000) */ ··· 486 463 /* 3.3.6 2-byte sequence with last byte missing (U+07FF) */ 487 464 { 488 465 "\xDF", 489 - "\xDF", /* bug: not rejected */ 466 + NULL, 490 467 "\\uFFFD", 491 468 }, 492 469 /* 3.3.7 3-byte sequence with last byte missing (U+FFFF) */ 493 470 { 494 471 "\xEF\xBF", 495 - "\xEF\xBF", /* bug: not rejected */ 472 + NULL, 496 473 "\\uFFFD", 497 474 }, 498 475 /* 3.3.8 4-byte sequence with last byte missing (U+1FFFFF) */ ··· 517 494 { 518 
495 "\xC0\xE0\x80\xF0\x80\x80\xF8\x80\x80\x80\xFC\x80\x80\x80\x80" 519 496 "\xDF\xEF\xBF\xF7\xBF\xBF\xFB\xBF\xBF\xBF\xFD\xBF\xBF\xBF\xBF", 520 - NULL, /* bug: accepted partly, see FIXME below */ 497 + NULL, 521 498 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 522 499 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD", 523 500 }, ··· 546 523 }, 547 524 { 548 525 "\xE0\x80\xAF", 549 - "\xE0\x80\xAF", /* bug: not rejected */ 526 + NULL, 550 527 "\\uFFFD", 551 528 }, 552 529 { 553 530 "\xF0\x80\x80\xAF", 554 - "\xF0\x80\x80\xAF", /* bug: not rejected */ 531 + NULL, 555 532 "\\uFFFD", 556 533 }, 557 534 { ··· 579 556 { 580 557 /* \U+07FF */ 581 558 "\xE0\x9F\xBF", 582 - "\xE0\x9F\xBF", /* bug: not rejected */ 559 + NULL, 583 560 "\\uFFFD", 584 561 }, 585 562 { ··· 590 567 * also 2.2.3 591 568 */ 592 569 "\xF0\x8F\xBF\xBC", 593 - "\xF0\x8F\xBF\xBC", /* bug: not rejected */ 570 + NULL, 594 571 "\\uFFFD", 595 572 }, 596 573 { ··· 615 592 { 616 593 /* \U+0000 */ 617 594 "\xE0\x80\x80", 618 - "\xE0\x80\x80", /* bug: not rejected */ 595 + NULL, 619 596 "\\uFFFD", 620 597 }, 621 598 { 622 599 /* \U+0000 */ 623 600 "\xF0\x80\x80\x80", 624 - "\xF0\x80\x80\x80", /* bug: not rejected */ 601 + NULL, 625 602 "\\uFFFD", 626 603 }, 627 604 { ··· 641 618 { 642 619 /* \U+D800 */ 643 620 "\xED\xA0\x80", 644 - "\xED\xA0\x80", /* bug: not rejected */ 621 + NULL, 645 622 "\\uFFFD", 646 623 }, 647 624 { 648 625 /* \U+DB7F */ 649 626 "\xED\xAD\xBF", 650 - "\xED\xAD\xBF", /* bug: not rejected */ 627 + NULL, 651 628 "\\uFFFD", 652 629 }, 653 630 { 654 631 /* \U+DB80 */ 655 632 "\xED\xAE\x80", 656 - "\xED\xAE\x80", /* bug: not rejected */ 633 + NULL, 657 634 "\\uFFFD", 658 635 }, 659 636 { 660 637 /* \U+DBFF */ 661 638 "\xED\xAF\xBF", 662 - "\xED\xAF\xBF", /* bug: not rejected */ 639 + NULL, 663 640 "\\uFFFD", 664 641 }, 665 642 { 666 643 /* \U+DC00 */ 667 644 "\xED\xB0\x80", 668 - "\xED\xB0\x80", /* bug: not rejected */ 645 + NULL, 669 646 "\\uFFFD", 670 647 }, 671 648 { 672 649 /* \U+DF80 */ 673 650 
"\xED\xBE\x80", 674 - "\xED\xBE\x80", /* bug: not rejected */ 651 + NULL, 675 652 "\\uFFFD", 676 653 }, 677 654 { 678 655 /* \U+DFFF */ 679 656 "\xED\xBF\xBF", 680 - "\xED\xBF\xBF", /* bug: not rejected */ 657 + NULL, 681 658 "\\uFFFD", 682 659 }, 683 660 /* 5.2 Paired UTF-16 surrogates */ 684 661 { 685 662 /* \U+D800\U+DC00 */ 686 663 "\xED\xA0\x80\xED\xB0\x80", 687 - "\xED\xA0\x80\xED\xB0\x80", /* bug: not rejected */ 664 + NULL, 688 665 "\\uFFFD\\uFFFD", 689 666 }, 690 667 { 691 668 /* \U+D800\U+DFFF */ 692 669 "\xED\xA0\x80\xED\xBF\xBF", 693 - "\xED\xA0\x80\xED\xBF\xBF", /* bug: not rejected */ 670 + NULL, 694 671 "\\uFFFD\\uFFFD", 695 672 }, 696 673 { 697 674 /* \U+DB7F\U+DC00 */ 698 675 "\xED\xAD\xBF\xED\xB0\x80", 699 - "\xED\xAD\xBF\xED\xB0\x80", /* bug: not rejected */ 676 + NULL, 700 677 "\\uFFFD\\uFFFD", 701 678 }, 702 679 { 703 680 /* \U+DB7F\U+DFFF */ 704 681 "\xED\xAD\xBF\xED\xBF\xBF", 705 - "\xED\xAD\xBF\xED\xBF\xBF", /* bug: not rejected */ 682 + NULL, 706 683 "\\uFFFD\\uFFFD", 707 684 }, 708 685 { 709 686 /* \U+DB80\U+DC00 */ 710 687 "\xED\xAE\x80\xED\xB0\x80", 711 - "\xED\xAE\x80\xED\xB0\x80", /* bug: not rejected */ 688 + NULL, 712 689 "\\uFFFD\\uFFFD", 713 690 }, 714 691 { 715 692 /* \U+DB80\U+DFFF */ 716 693 "\xED\xAE\x80\xED\xBF\xBF", 717 - "\xED\xAE\x80\xED\xBF\xBF", /* bug: not rejected */ 694 + NULL, 718 695 "\\uFFFD\\uFFFD", 719 696 }, 720 697 { 721 698 /* \U+DBFF\U+DC00 */ 722 699 "\xED\xAF\xBF\xED\xB0\x80", 723 - "\xED\xAF\xBF\xED\xB0\x80", /* bug: not rejected */ 700 + NULL, 724 701 "\\uFFFD\\uFFFD", 725 702 }, 726 703 { 727 704 /* \U+DBFF\U+DFFF */ 728 705 "\xED\xAF\xBF\xED\xBF\xBF", 729 - "\xED\xAF\xBF\xED\xBF\xBF", /* bug: not rejected */ 706 + NULL, 730 707 "\\uFFFD\\uFFFD", 731 708 }, 732 709 /* 5.3 Other illegal code positions */ ··· 734 711 { 735 712 /* \U+FFFE */ 736 713 "\xEF\xBF\xBE", 737 - "\xEF\xBF\xBE", /* bug: not rejected */ 714 + NULL, 738 715 "\\uFFFD", 739 716 }, 740 717 { 741 718 /* \U+FFFF */ 742 719 "\xEF\xBF\xBF", 
743 - "\xEF\xBF\xBF", /* bug: not rejected */ 720 + NULL, 744 721 "\\uFFFD", 745 722 }, 746 723 { 747 724 /* U+FDD0 */ 748 725 "\xEF\xB7\x90", 749 - "\xEF\xB7\x90", /* bug: not rejected */ 726 + NULL, 750 727 "\\uFFFD", 751 728 }, 752 729 { 753 730 /* U+FDEF */ 754 731 "\xEF\xB7\xAF", 755 - "\xEF\xB7\xAF", /* bug: not rejected */ 732 + NULL, 756 733 "\\uFFFD", 757 734 }, 758 735 /* Plane 1 .. 16 noncharacters */ ··· 774 751 "\xF3\xAF\xBF\xBE\xF3\xAF\xBF\xBF" 775 752 "\xF3\xBF\xBF\xBE\xF3\xBF\xBF\xBF" 776 753 "\xF4\x8F\xBF\xBE\xF4\x8F\xBF\xBF", 777 - /* bug: not rejected */ 778 - "\xF0\x9F\xBF\xBE\xF0\x9F\xBF\xBF" 779 - "\xF0\xAF\xBF\xBE\xF0\xAF\xBF\xBF" 780 - "\xF0\xBF\xBF\xBE\xF0\xBF\xBF\xBF" 781 - "\xF1\x8F\xBF\xBE\xF1\x8F\xBF\xBF" 782 - "\xF1\x9F\xBF\xBE\xF1\x9F\xBF\xBF" 783 - "\xF1\xAF\xBF\xBE\xF1\xAF\xBF\xBF" 784 - "\xF1\xBF\xBF\xBE\xF1\xBF\xBF\xBF" 785 - "\xF2\x8F\xBF\xBE\xF2\x8F\xBF\xBF" 786 - "\xF2\x9F\xBF\xBE\xF2\x9F\xBF\xBF" 787 - "\xF2\xAF\xBF\xBE\xF2\xAF\xBF\xBF" 788 - "\xF2\xBF\xBF\xBE\xF2\xBF\xBF\xBF" 789 - "\xF3\x8F\xBF\xBE\xF3\x8F\xBF\xBF" 790 - "\xF3\x9F\xBF\xBE\xF3\x9F\xBF\xBF" 791 - "\xF3\xAF\xBF\xBE\xF3\xAF\xBF\xBF" 792 - "\xF3\xBF\xBF\xBE\xF3\xBF\xBF\xBF" 793 - "\xF4\x8F\xBF\xBE\xF4\x8F\xBF\xBF", 754 + NULL, 794 755 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 795 756 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" 796 757 "\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD\\uFFFD" ··· 829 790 } 830 791 in = strndup(tail, end - tail); 831 792 str = from_json_str(in, j, NULL); 832 - /* 833 - * FIXME JSON parser accepts invalid sequence 834 - * starting with \xC2..\xF4 835 - */ 836 - if (*in >= '\xC2' && *in <= '\xF4') { 837 - g_free(str); 838 - str = NULL; 839 - } 840 793 g_assert(!str); 841 794 g_free(in); 842 795 }

+62 -7

util/unicode.c

··· 13 13 #include "qemu/osdep.h" 14 14 #include "qemu/unicode.h" 15 15 16 + static bool is_valid_codepoint(int codepoint) 17 + { 18 + if (codepoint > 0x10FFFFu) { 19 + return false; /* beyond Unicode range */ 20 + } 21 + if ((codepoint >= 0xFDD0 && codepoint <= 0xFDEF) 22 + || (codepoint & 0xFFFE) == 0xFFFE) { 23 + return false; /* noncharacter */ 24 + } 25 + if (codepoint >= 0xD800 && codepoint <= 0xDFFF) { 26 + return false; /* surrogate code point */ 27 + } 28 + return true; 29 + } 30 + 16 31 /** 17 32 * mod_utf8_codepoint: 18 33 * @s: string encoded in modified UTF-8 ··· 83 98 cp <<= 6; 84 99 cp |= byte & 0x3F; 85 100 } 86 - if (cp > 0x10FFFF) { 87 - cp = -1; /* beyond Unicode range */ 88 - } else if ((cp >= 0xFDD0 && cp <= 0xFDEF) 89 - || (cp & 0xFFFE) == 0xFFFE) { 90 - cp = -1; /* noncharacter */ 91 - } else if (cp >= 0xD800 && cp <= 0xDFFF) { 92 - cp = -1; /* surrogate code point */ 101 + if (!is_valid_codepoint(cp)) { 102 + cp = -1; 93 103 } else if (cp < min_cp[len - 2] && !(cp == 0 && len == 2)) { 94 104 cp = -1; /* overlong, not \xC0\x80 */ 95 105 } ··· 99 109 *end = (char *)p; 100 110 return cp; 101 111 } 112 + 113 + /** 114 + * mod_utf8_encode: 115 + * @buf: Destination buffer 116 + * @bufsz: size of @buf, at least 5. 117 + * @codepoint: Unicode codepoint to encode 118 + * 119 + * Convert Unicode codepoint @codepoint to modified UTF-8. 120 + * 121 + * Returns: the length of the UTF-8 sequence on success, -1 when 122 + * @codepoint is invalid. 
123 + */ 124 + ssize_t mod_utf8_encode(char buf[], size_t bufsz, int codepoint) 125 + { 126 + assert(bufsz >= 5); 127 + 128 + if (!is_valid_codepoint(codepoint)) { 129 + return -1; 130 + } 131 + 132 + if (codepoint > 0 && codepoint <= 0x7F) { 133 + buf[0] = codepoint & 0x7F; 134 + buf[1] = 0; 135 + return 1; 136 + } 137 + if (codepoint <= 0x7FF) { 138 + buf[0] = 0xC0 | ((codepoint >> 6) & 0x1F); 139 + buf[1] = 0x80 | (codepoint & 0x3F); 140 + buf[2] = 0; 141 + return 2; 142 + } 143 + if (codepoint <= 0xFFFF) { 144 + buf[0] = 0xE0 | ((codepoint >> 12) & 0x0F); 145 + buf[1] = 0x80 | ((codepoint >> 6) & 0x3F); 146 + buf[2] = 0x80 | (codepoint & 0x3F); 147 + buf[3] = 0; 148 + return 3; 149 + } 150 + buf[0] = 0xF0 | ((codepoint >> 18) & 0x07); 151 + buf[1] = 0x80 | ((codepoint >> 12) & 0x3F); 152 + buf[2] = 0x80 | ((codepoint >> 6) & 0x3F); 153 + buf[3] = 0x80 | (codepoint & 0x3F); 154 + buf[4] = 0; 155 + return 4; 156 + }