Rust implementation of the CVM algorithm for counting distinct elements in a stream

Initial commit

Initial commit

+171
+1
.gitignore
··· 1 + /target
+32
Cargo.toml
··· 1 + [package] 2 + name = "cvmcount" 3 + description = "use the CVM algorithm to quickly estimate the number of tokens in a stream" 4 + readme = "README.md" 5 + license = "MIT OR Apache-2.0" 6 + repository = "https://github.com/urschrei/cvmcount" 7 + 8 + version = "0.1.0" 9 + edition = "2021" 10 + 11 + [dependencies] 12 + rand = "0.8.5" 13 + regex = "1.10.4" 14 + clap = { version = "4.5.4", features = ["cargo"] } 15 + 16 + [lib] 17 + name = "cvmcount" 18 + path = "src/lib.rs" 19 + doctest = false 20 + 21 + [[bin]] 22 + name = "cvmcount" 23 + path = "src/main.rs" 24 + test = false 25 + 26 + [profile.release] 27 + lto = true 28 + codegen-units = 1 29 + 30 + [profile.bench] 31 + lto = true 32 + codegen-units = 1
+18
README.md
# Rust implementation of the CVM counting algorithm

This library implements

Chakraborty, S., Vinodchandran, N. V., & Meel, K. S. (2022). *Distinct Elements in Streams: An Algorithm for the (Text) Book*. 6 pages, 727571 bytes. https://doi.org/10.4230/LIPIcs.ESA.2022.34

The blog post describing the algorithm is here: https://www.quantamagazine.org/computer-scientists-invent-an-efficient-new-way-to-count-20240516/

# CLI Example
`cvmcount file.txt 0.8 0.1 2900`

The arguments are, in order: the input text file, epsilon, delta, and the approximate stream size. The `--help` option is available.

## Note
If you're thinking about using this library, you presumably know that it only provides an estimate (within the specified bounds), similar to something like HyperLogLog. You are trading accuracy for speed!

## Implementation Details
This library strips punctuation from input tokens using a regex. I assume there is a small performance penalty, but it seems like a small price to pay for increased practicality.
+76
src/lib.rs
··· 1 + //! An implementation of the CVM fast token counting algorithm presented in 2 + //! Chakraborty, S., Vinodchandran, N. V., & Meel, K. S. (2022). *Distinct Elements in Streams: An Algorithm for the (Text) Book*. 6 pages, 727571 bytes. https://doi.org/10.4230/LIPIcs.ESA.2022.34 3 + 4 + use rand::rngs::ThreadRng; 5 + use rand::Rng; 6 + use regex::Regex; 7 + 8 + pub struct CVM { 9 + buf_size: usize, 10 + buf: Vec<String>, 11 + probability: f64, 12 + rng: ThreadRng, 13 + re: Regex, 14 + } 15 + impl CVM { 16 + /// Initialise the algorithm 17 + /// 18 + /// epsilon: how close you want your estimate to be to the true number of distinct elements. 19 + /// A smaller ε means you require a more precise estimate. 20 + /// For example, ε = 0.05 means you want your estimate to be within 5% of the actual value. 21 + /// An epsilon of 0.8 is a good starting point for most applications. 22 + /// 23 + /// delta: The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence 24 + /// (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a 25 + /// higher chance the estimate might be outside the desired range. 26 + /// A delta of 0.1 is a good starting point for most applications. 27 + /// 28 + /// stream_size: this is used to determine buffer size and can be a loose approximation. The closer it is to the stream size, 29 + /// the more accurate the results 30 + pub fn new(epsilon: f64, delta: f64, stream_size: usize) -> Self { 31 + let bufsize = buffer_size(epsilon, delta, stream_size); 32 + Self { 33 + buf_size: bufsize as usize, 34 + buf: Vec::with_capacity(bufsize as usize), 35 + probability: 1.0, 36 + rng: rand::thread_rng(), 37 + re: Regex::new(r"[^\w\s]").unwrap(), 38 + } 39 + } 40 + /// Count tokens, given a string containing words, e.g. 
a line of a book 41 + pub fn process_line_tokens(&mut self, line: String) { 42 + let words = line.split(' '); 43 + for word in words { 44 + let clean_word = self.re.replace_all(word, "").to_lowercase(); 45 + // binary search should be pretty fast 46 + // I think this will be faster than a hashset for practical sizes 47 + // but I need some empirical data for this 48 + if let Some(pos) = self.buf.iter().position(|x| *x == clean_word) { 49 + self.buf.swap_remove(pos); 50 + } 51 + if self.rng.gen_bool(self.probability) { 52 + self.buf.push(clean_word); 53 + } 54 + if self.buf.len() == self.buf_size { 55 + self.clear_about_half(); 56 + self.probability /= 2.0; 57 + if self.buf.len() == self.buf_size { 58 + panic!("Something has gone proper wrong") 59 + } 60 + } 61 + } 62 + } 63 + // remove around half of the elements at random 64 + fn clear_about_half(&mut self) { 65 + self.buf.retain(|_| self.rng.gen_bool(0.5)); 66 + } 67 + /// Calculate the final token count 68 + pub fn calculate_final_result(&self) -> f64 { 69 + self.buf.len() as f64 / self.probability 70 + } 71 + } 72 + 73 + // Calculate threshold (buf_size) value for the F0-Estimator algorithm 74 + fn buffer_size(epsilon: f64, delta: f64, stream_size: usize) -> u32 { 75 + ((12.0 / epsilon.powf(2.0)) * ((8.0 * stream_size as f64) / delta).log2()).ceil() as u32 76 + }
+44
src/main.rs
··· 1 + use clap::{arg, crate_version, value_parser, Command}; 2 + use std::io::BufRead; 3 + use std::path::PathBuf; 4 + use std::fs::File; 5 + use std::io::BufReader; 6 + use std::path::Path; 7 + 8 + use cvmcount::CVM; 9 + 10 + pub fn open_file<P>(filename: P) -> BufReader<File> 11 + where 12 + P: AsRef<Path>, 13 + { 14 + // no need to use a BufReader since we want the entire file 15 + let f = File::open(filename).expect("Couldn't read from file"); 16 + BufReader::new(f) 17 + } 18 + 19 + fn main() { 20 + // Generate a CLI, and get input filename to process 21 + let params = Command::new("CVM") 22 + .version(crate_version!()) 23 + .author("Stephan Hügel <urschrei@gmail.com>") 24 + .about("Use the CVM algorithm to estimate the number of unique tokens in a stream") 25 + .arg(arg!(-t --tokens <FILE> "A text file containing words").index(1).required(true).value_parser(value_parser!(PathBuf))) 26 + .arg(arg!(-e --epsilon <EPSILON> "How close you want your estimate to be to the true number of distinct tokens. A smaller ε means you require a more precise estimate. For example, ε = 0.05 means you want your estimate to be within 5 % of the actual value. An epsilon of 0.8 is a good starting point for most applications").index(2).required(true).value_parser(value_parser!(f64))) 27 + .arg(arg!(-d --delta <DELTA> "The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a higher chance the estimate might be outside the desired range. A delta of 0.1 is a good starting point for most applications").index(3).required(true).value_parser(value_parser!(f64))) 28 + .arg(arg!(-s --streamsize <STREAM_SIZE> "This is used to determine buffer size and can be a loose approximation. 
The closer it is to the stream size, the more accurate the results").index(4).required(true).value_parser(value_parser!(usize))) 29 + .get_matches(); 30 + let input_file = params.get_one::<PathBuf>("tokens").unwrap(); 31 + let epsilon = params.get_one::<f64>("epsilon").unwrap(); 32 + let delta = params.get_one::<f64>("delta").unwrap(); 33 + let stream_size = params.get_one::<usize>("streamsize").unwrap(); 34 + 35 + let mut counter = CVM::new(*epsilon, *delta, *stream_size); 36 + let br = open_file(input_file); 37 + for line in br.lines() { 38 + counter.process_line_tokens(line.unwrap()) 39 + } 40 + println!( 41 + "Unique tokens: {:?}", 42 + counter.calculate_final_result() as i32 43 + ); 44 + }