···11//! [`TokenWeights`] are representations of how common [`Token`]s are, and are paired up with
22//! a [`TokenPair`](crate::token::TokenPair) in a [`NailKov`](crate::NailKov).
3344+use core::hash::BuildHasherDefault;
55+66+use estr::IdentityHasher;
47use indexmap::IndexMap;
58use rand::Rng;
69use rand_distr::{Distribution, weighted::WeightedAliasIndex};
71088-use crate::{RandomState, error::NailError, token::Token};
1111+use crate::{error::NailError, token::Token};
9121013/// A distribution of choices and their likelihood.
1114#[derive(Clone, Debug)]
···2932#[derive(Clone, Debug)]
3033pub struct TokenWeightsBuilder {
3134 /// Counts how many times a token is likely to appear.
3232- occurrences: IndexMap<Token, u32, RandomState>,
3535+ occurrences: IndexMap<Token, u32, BuildHasherDefault<IdentityHasher>>,
3336}
34373538impl TokenWeightsBuilder {
3636- pub fn new(hasher: RandomState) -> Self {
3939+ pub fn new() -> Self {
3740 Self {
3838- occurrences: IndexMap::with_hasher(hasher),
4141+ occurrences: IndexMap::with_hasher(Default::default()),
3942 }
4043 }
4144···64676568impl Default for TokenWeightsBuilder {
6669 fn default() -> Self {
6767- Self::new(RandomState::new())
7070+ Self::new()
6871 }
6972}
-237
crates/nailkov/src/interner.rs
···11-use hashbrown::{Equivalent, HashMap};
22-use rapidhash::fast::RandomState;
33-44-use crate::token::Token;
/// Lifetime-erased pointer to a string slice.
///
/// Equality (derived) compares the raw pointers themselves, while hashing
/// goes through the pointed-to text so that a plain `str` with the same
/// contents hashes identically.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(transparent)]
struct StringPtr(*const str);

impl StringPtr {
    /// Reborrows the pointee as a `&str`.
    #[inline(always)]
    const fn cast(&self) -> &str {
        let Self(raw) = *self;
        // SAFETY: The pointee is never moved or invalidated while this value
        // is alive: the `String` it points into is owned alongside every
        // `StringPtr` that references it, and both are dropped together, so
        // dereferencing back to a string slice is sound.
        unsafe { &*raw }
    }
}

impl core::hash::Hash for StringPtr {
    /// Hashes the referenced text rather than the pointer address.
    #[inline]
    fn hash<H>(&self, state: &mut H)
    where
        H: core::hash::Hasher,
    {
        core::hash::Hash::hash(self.cast(), state);
    }
}

// SAFETY: A `StringPtr` points at heap memory that is neither moved nor
// invalidated while the owning `Interner` lives, and no `StringPtr` outlives
// that `Interner`. `String` is `Send`, therefore so is `StringPtr`.
unsafe impl Send for StringPtr {}

// SAFETY: A `StringPtr` points at heap memory that is neither moved nor
// invalidated while the owning `Interner` lives, and no `StringPtr` outlives
// that `Interner`. `String` is `Sync`, therefore so is `StringPtr`.
unsafe impl Sync for StringPtr {}
3636-3737-#[derive(Debug, Clone)]
3838-pub struct Interner {
3939- collected: HashMap<StringPtr, Token, RandomState>,
4040- index: Vec<StringPtr>,
4141- buffer: String,
4242- stored: Vec<String>,
4343-}
4444-4545-impl Default for Interner {
4646- fn default() -> Self {
4747- Self::with_capacity(256)
4848- }
4949-}
5050-5151-impl Interner {
5252- /// # Safety
5353- /// The caller must ensure that the [`Token`] being passed in was allocated
5454- /// from the same [`Interner`] instance.
5555- #[inline(always)]
5656- pub unsafe fn lookup(&self, id: Token) -> &str {
5757- // SAFETY: Safety is upheld by the caller ensuring the id was allocated
5858- // from the same interner.
5959- unsafe { self.index.get_unchecked(id.index()).cast() }
6060- }
6161-6262- pub fn with_capacity(cap: usize) -> Interner {
6363- // This will get us just under 64KiB of interned storage before we
6464- // need to allocate more space for buffer storage.
6565- let stored = Vec::with_capacity(8);
6666-6767- Interner {
6868- collected: HashMap::with_hasher(RandomState::new()),
6969- index: Vec::new(),
7070- stored,
7171- buffer: String::with_capacity(cap.next_power_of_two()),
7272- }
7373- }
7474-7575- pub fn intern(&mut self, text: &str) -> Token {
7676- if let Some(&id) = self.collected.get(text) {
7777- return id;
7878- }
7979-8080- // SAFETY: `alloc`` is never called elsewhere, nor the properties it controls
8181- // are modified outside of the method. Here we get a new StringPtr for `text` that
8282- // hasn't been stored before.
8383- let name = unsafe { self.alloc(text) };
8484- let id = Token::new(self.index.len() as u32);
8585- self.collected.insert(name, id);
8686- self.index.push(name);
8787-8888- // SAFETY: We are using the id allocated within the same function scope,
8989- // so it is always from the same source.
9090- unsafe {
9191- debug_assert!(self.lookup(id).equivalent(&name));
9292- }
9393- debug_assert!(self.intern(name.cast()) == id);
9494-9595- id
9696- }
9797-9898- /// Allocates a new [`StringPtr`] for the given string input. If there is no more room
9999- /// in the current buffer, it allocates a new buffer and creates the StringPtr to reference
100100- /// the stored string in the new buffer, storing the old one.
101101- ///
102102- /// # Safety
103103- ///
104104- /// The caller must ensure that `self.buffers` and `self.active` are never modified elsewhere,
105105- /// and that this is called only for new instances of `text`.
106106- unsafe fn alloc(&mut self, text: &str) -> StringPtr {
107107- let capacity = self.buffer.capacity();
108108-109109- if capacity < self.buffer.len() + text.len() {
110110- // If we ran out of capacity in our storage, allocate a new buffer with
111111- // larger capacity.
112112- let new_cap = (capacity.max(text.len()) + 1).next_power_of_two();
113113- let old_buf = core::mem::replace(&mut self.buffer, String::with_capacity(new_cap));
114114-115115- self.stored.push(old_buf);
116116- }
117117-118118- // Construct raw str slice to eliminate lifetime tracking as we manage its
119119- // lifetime within the Interner instance.
120120- let interned = {
121121- let start = self.buffer.len();
122122- self.buffer.push_str(text);
123123-124124- &raw const self.buffer[start..]
125125- };
126126-127127- StringPtr(interned)
128128- }
129129-}
130130-131131-impl Equivalent<StringPtr> for str {
132132- #[inline(always)]
133133- fn equivalent(&self, key: &StringPtr) -> bool {
134134- key.cast().eq(self)
135135- }
136136-}
#[cfg(test)]
mod tests {
    use super::*;

    /// Strings shared by the multi-string tests; `"duplicated"` appears twice on purpose.
    const SAMPLE_TEXTS: [&str; 7] = [
        "Lorem ipsum",
        "dolor sit amet",
        "duplicated",
        "Other text",
        "Elevenses",
        "duplicated",
        "Gibberish",
    ];

    /// Distinct pointers compare unequal, while a `str` is equivalent to a
    /// pointer at the same text.
    #[test]
    fn string_ptr_comparisons() {
        let first = "one";
        let second = "two";

        let first_ptr = StringPtr(first);
        let second_ptr = StringPtr(second);

        assert_ne!(first_ptr, second_ptr);

        assert!(first.equivalent(&first_ptr));
    }

    /// Interning the same text twice yields one token and stores the text once.
    #[test]
    fn is_able_to_intern_one_string() {
        let mut interner = Interner::default();

        assert!(interner.buffer.is_empty());

        let text = "Lorem ipsum";

        let first = interner.intern(text);

        // SAFETY: It comes from the same source
        unsafe {
            assert_eq!(text, interner.lookup(first));
        }
        assert_eq!(interner.buffer.len(), 11);

        let second = interner.intern(text);

        assert_eq!(first, second);
        assert_eq!(interner.buffer.len(), 11);
    }

    /// Tokens are issued sequentially, duplicates reuse their token, and the
    /// buffer rolls over once the initial capacity is exhausted.
    #[test]
    fn is_able_to_intern_many_strings() {
        let mut interner = Interner::with_capacity(32);

        let mut interned: Vec<Token> = Vec::new();
        for text in SAMPLE_TEXTS {
            interned.push(interner.intern(text));
        }

        assert_eq!(
            interned.as_slice(),
            &[
                Token::new(0),
                Token::new(1),
                Token::new(2),
                Token::new(3),
                Token::new(4),
                Token::new(2),
                Token::new(5)
            ]
        );
        assert_eq!(interner.buffer.capacity(), 64);
        assert_eq!(interner.stored.len(), 1);
        assert_eq!(interner.stored[0].capacity(), 32);
    }

    /// An interner can be moved to another thread and still resolve its tokens.
    #[test]
    fn is_thread_safe() {
        let mut interner = Interner::with_capacity(32);

        let interned: Vec<Token> = SAMPLE_TEXTS
            .iter()
            .map(|&text| interner.intern(text))
            .collect();

        std::thread::scope(|scope| {
            scope.spawn(move || {
                for (id, expected) in interned.into_iter().zip(SAMPLE_TEXTS) {
                    // SAFETY: It comes from the same source
                    unsafe {
                        assert_eq!(expected, interner.lookup(id));
                    }
                }
            });
        });
    }
}