compiler-core/src/strings.rs at wasm · daniellemaywood.uk/gleam

daniellemaywood.uk / gleam
fork atom
this repo has no description
fork atom
gleam / compiler-core / src / strings.rs
at wasm 167 lines 5.6 kB view raw
wrap content
gearsco.de Tweak comment wording 10mo ago
fef83501
  1use ecow::EcoString;
  2use itertools::Itertools;
  3
  4use crate::ast::Endianness;
  5
  6/// Converts any escape sequences from the given string to their correct
  7/// bytewise UTF-8 representation and returns the resulting string.
  8pub fn convert_string_escape_chars(str: &EcoString) -> EcoString {
  9    let mut filtered_str = EcoString::new();
 10    let mut str_iter = str.chars().peekable();
 11    loop {
 12        match str_iter.next() {
 13            Some('\\') => match str_iter.next() {
 14                // Check for Unicode escape sequence, e.g. \u{00012FF}
 15                Some('u') => {
 16                    if str_iter.peek() != Some(&'{') {
 17                        // Invalid Unicode escape sequence
 18                        filtered_str.push('u');
 19                        continue;
 20                    }
 21
 22                    // Consume the left brace after peeking
 23                    let _ = str_iter.next();
 24
 25                    let codepoint_str = str_iter
 26                        .peeking_take_while(char::is_ascii_hexdigit)
 27                        .collect::<String>();
 28
 29                    if codepoint_str.is_empty() || str_iter.peek() != Some(&'}') {
 30                        // Invalid Unicode escape sequence
 31                        filtered_str.push_str("u{");
 32                        filtered_str.push_str(&codepoint_str);
 33                        continue;
 34                    }
 35
 36                    let codepoint = u32::from_str_radix(&codepoint_str, 16)
 37                        .ok()
 38                        .and_then(char::from_u32);
 39
 40                    if let Some(codepoint) = codepoint {
 41                        // Consume the right brace after peeking
 42                        let _ = str_iter.next();
 43
 44                        // Consider this codepoint's length instead of
 45                        // that of the Unicode escape sequence itself
 46                        filtered_str.push(codepoint);
 47                    } else {
 48                        // Invalid Unicode escape sequence
 49                        // (codepoint value not in base 16 or too large)
 50                        filtered_str.push_str("u{");
 51                        filtered_str.push_str(&codepoint_str);
 52                    }
 53                }
 54                Some('n') => filtered_str.push('\n'),
 55                Some('r') => filtered_str.push('\r'),
 56                Some('f') => filtered_str.push('\u{C}'),
 57                Some('t') => filtered_str.push('\t'),
 58                Some('"') => filtered_str.push('\"'),
 59                Some('\\') => filtered_str.push('\\'),
 60                Some(c) => filtered_str.push(c),
 61                None => break,
 62            },
 63            Some(c) => filtered_str.push(c),
 64            None => break,
 65        }
 66    }
 67    filtered_str
 68}
 69
 70pub fn to_snake_case(string: &str) -> EcoString {
 71    let mut snake_case = EcoString::with_capacity(string.len());
 72    let mut is_word_boundary = true;
 73
 74    for char in string.chars() {
 75        match char {
 76            '_' | ' ' => {
 77                is_word_boundary = true;
 78                continue;
 79            }
 80            _ if char.is_uppercase() => {
 81                is_word_boundary = true;
 82            }
 83            _ => {}
 84        }
 85
 86        if is_word_boundary {
 87            // We don't want to push an underscore at the start of the string,
 88            // even if it starts with a capital letter or other delimiter.
 89            if !snake_case.is_empty() {
 90                snake_case.push('_');
 91            }
 92            is_word_boundary = false;
 93        }
 94        snake_case.push(char.to_ascii_lowercase());
 95    }
 96
 97    snake_case
 98}
 99
100pub fn to_upper_camel_case(string: &str) -> EcoString {
101    let mut pascal_case = EcoString::with_capacity(string.len());
102    let mut chars = string.chars();
103
104    while let Some(char) = chars.next() {
105        if char == '_' {
106            let Some(next) = chars.next() else { break };
107            pascal_case.push(next.to_ascii_uppercase());
108        } else {
109            pascal_case.push(char);
110        }
111    }
112
113    pascal_case
114}
115
116/// Converts a string into its UTF-16 representation in bytes
117pub fn string_to_utf16_bytes(string: &str, endianness: Endianness) -> Vec<u8> {
118    let mut bytes = Vec::with_capacity(string.len() * 2);
119
120    let mut character_buffer = [0, 0];
121    for character in string.chars() {
122        let segments = character.encode_utf16(&mut character_buffer);
123
124        for segment in segments {
125            let segment_bytes = match endianness {
126                Endianness::Big => segment.to_be_bytes(),
127                Endianness::Little => segment.to_le_bytes(),
128            };
129
130            bytes.push(segment_bytes[0]);
131            bytes.push(segment_bytes[1]);
132        }
133    }
134
135    bytes
136}
137
138/// Converts a string into its UTF-32 representation in bytes
139pub fn string_to_utf32_bytes(string: &str, endianness: Endianness) -> Vec<u8> {
140    let mut bytes = Vec::with_capacity(string.len() * 4);
141
142    for character in string.chars() {
143        let character_bytes = match endianness {
144            Endianness::Big => (character as u32).to_be_bytes(),
145            Endianness::Little => (character as u32).to_le_bytes(),
146        };
147        bytes.extend(character_bytes);
148    }
149
150    bytes
151}
152
/// Returns the number of 16-bit code units needed to encode `string` as
/// UTF-16.
///
/// Characters that need a surrogate pair in UTF-16 count as two.
pub fn length_utf16(string: &str) -> usize {
    string.chars().map(char::len_utf16).sum()
}
163
/// Returns the number of 32-bit code units needed to encode `string` as
/// UTF-32, which is the number of `char`s it contains.
pub fn length_utf32(string: &str) -> usize {
    let characters = string.chars();
    characters.count()
}
167}