···11+[package]
22+name = "cvmcount"
description = "use the CVM algorithm to quickly estimate the number of distinct tokens in a stream"
44+readme = "README.md"
55+license = "MIT OR Apache-2.0"
66+repository = "https://github.com/urschrei/cvmcount"
77+88+version = "0.1.0"
99+edition = "2021"
1010+1111+[dependencies]
1212+rand = "0.8.5"
1313+regex = "1.10.4"
1414+clap = { version = "4.5.4", features = ["cargo"] }
1515+1616+[lib]
1717+name = "cvmcount"
1818+path = "src/lib.rs"
1919+doctest = false
2020+2121+[[bin]]
2222+name = "cvmcount"
2323+path = "src/main.rs"
2424+test = false
2525+2626+[profile.release]
2727+lto = true
2828+codegen-units = 1
2929+3030+[profile.bench]
3131+lto = true
3232+codegen-units = 1
+18
README.md
···11+# Rust implementation of the CVM counting algorithm

This library implements the CVM algorithm for estimating the number of distinct elements in a stream, as described in:

Chakraborty, S., Vinodchandran, N. V., & Meel, K. S. (2022). *Distinct Elements in Streams: An Algorithm for the (Text) Book*. 6 pages, 727571 bytes. https://doi.org/10.4230/LIPIcs.ESA.2022.34
66+77+The blog post describing the algorithm is here: https://www.quantamagazine.org/computer-scientists-invent-an-efficient-new-way-to-count-20240516/
88+99+# CLI Example
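`cvmcount file.txt 0.8 0.1 2900`

The arguments are, in order: the input text file, epsilon, delta, and an approximate stream size (number of tokens).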
1111+1212+The `--help` option is available.
1313+1414+## Note
1515+If you're thinking about using this library, you presumably know that it only provides an estimate (within the specified bounds), similar to something like HyperLogLog. You are trading accuracy for speed!
1616+1717+## Implementation Details
This library strips punctuation from input tokens using a regex and lowercases them before counting. I assume there is a small performance penalty, but it seems like a small price to pay for increased practicality.
+76
src/lib.rs
···11+//! An implementation of the CVM fast token counting algorithm presented in
22+//! Chakraborty, S., Vinodchandran, N. V., & Meel, K. S. (2022). *Distinct Elements in Streams: An Algorithm for the (Text) Book*. 6 pages, 727571 bytes. https://doi.org/10.4230/LIPIcs.ESA.2022.34
33+44+use rand::rngs::ThreadRng;
55+use rand::Rng;
66+use regex::Regex;
77+88+pub struct CVM {
99+ buf_size: usize,
1010+ buf: Vec<String>,
1111+ probability: f64,
1212+ rng: ThreadRng,
1313+ re: Regex,
1414+}
1515+impl CVM {
1616+ /// Initialise the algorithm
1717+ ///
1818+ /// epsilon: how close you want your estimate to be to the true number of distinct elements.
1919+ /// A smaller ε means you require a more precise estimate.
2020+ /// For example, ε = 0.05 means you want your estimate to be within 5% of the actual value.
2121+ /// An epsilon of 0.8 is a good starting point for most applications.
2222+ ///
2323+ /// delta: The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence
2424+ /// (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a
2525+ /// higher chance the estimate might be outside the desired range.
2626+ /// A delta of 0.1 is a good starting point for most applications.
2727+ ///
2828+ /// stream_size: this is used to determine buffer size and can be a loose approximation. The closer it is to the stream size,
2929+ /// the more accurate the results
3030+ pub fn new(epsilon: f64, delta: f64, stream_size: usize) -> Self {
3131+ let bufsize = buffer_size(epsilon, delta, stream_size);
3232+ Self {
3333+ buf_size: bufsize as usize,
3434+ buf: Vec::with_capacity(bufsize as usize),
3535+ probability: 1.0,
3636+ rng: rand::thread_rng(),
3737+ re: Regex::new(r"[^\w\s]").unwrap(),
3838+ }
3939+ }
4040+ /// Count tokens, given a string containing words, e.g. a line of a book
4141+ pub fn process_line_tokens(&mut self, line: String) {
4242+ let words = line.split(' ');
4343+ for word in words {
4444+ let clean_word = self.re.replace_all(word, "").to_lowercase();
4545+ // binary search should be pretty fast
4646+ // I think this will be faster than a hashset for practical sizes
4747+ // but I need some empirical data for this
4848+ if let Some(pos) = self.buf.iter().position(|x| *x == clean_word) {
4949+ self.buf.swap_remove(pos);
5050+ }
5151+ if self.rng.gen_bool(self.probability) {
5252+ self.buf.push(clean_word);
5353+ }
5454+ if self.buf.len() == self.buf_size {
5555+ self.clear_about_half();
5656+ self.probability /= 2.0;
5757+ if self.buf.len() == self.buf_size {
5858+ panic!("Something has gone proper wrong")
5959+ }
6060+ }
6161+ }
6262+ }
6363+ // remove around half of the elements at random
6464+ fn clear_about_half(&mut self) {
6565+ self.buf.retain(|_| self.rng.gen_bool(0.5));
6666+ }
6767+ /// Calculate the final token count
6868+ pub fn calculate_final_result(&self) -> f64 {
6969+ self.buf.len() as f64 / self.probability
7070+ }
7171+}
/// Calculate the threshold (buffer size) for the F0-Estimator algorithm.
///
/// Evaluates `ceil((12 / epsilon^2) * log2(8 * stream_size / delta))` in `f64` and converts
/// the result to `u32` with an `as` cast.
fn buffer_size(epsilon: f64, delta: f64, stream_size: usize) -> u32 {
    let scale = 12.0 / epsilon.powf(2.0);
    let ratio = (8.0 * stream_size as f64) / delta;
    let threshold = scale * ratio.log2();
    threshold.ceil() as u32
}
+44
src/main.rs
···11+use clap::{arg, crate_version, value_parser, Command};
22+use std::io::BufRead;
33+use std::path::PathBuf;
44+use std::fs::File;
55+use std::io::BufReader;
66+use std::path::Path;
77+88+use cvmcount::CVM;
/// Open `filename` and wrap it in a `BufReader`.
///
/// The buffered reader lets the caller iterate over the file line by line.
///
/// # Panics
/// Panics if the file cannot be opened.
pub fn open_file<P: AsRef<Path>>(filename: P) -> BufReader<File> {
    File::open(filename)
        .map(BufReader::new)
        .expect("Couldn't read from file")
}
1818+1919+fn main() {
2020+ // Generate a CLI, and get input filename to process
2121+ let params = Command::new("CVM")
2222+ .version(crate_version!())
2323+ .author("Stephan Hügel <urschrei@gmail.com>")
2424+ .about("Use the CVM algorithm to estimate the number of unique tokens in a stream")
2525+ .arg(arg!(-t --tokens <FILE> "A text file containing words").index(1).required(true).value_parser(value_parser!(PathBuf)))
2626+ .arg(arg!(-e --epsilon <EPSILON> "How close you want your estimate to be to the true number of distinct tokens. A smaller ε means you require a more precise estimate. For example, ε = 0.05 means you want your estimate to be within 5 % of the actual value. An epsilon of 0.8 is a good starting point for most applications").index(2).required(true).value_parser(value_parser!(f64)))
2727+ .arg(arg!(-d --delta <DELTA> "The level of certainty that the algorithm's estimate will fall within the desired accuracy range. A higher confidence (e.g., 99.9 %) means you're very sure the estimate will be accurate, while a lower confidence (e.g., 90 %) means there's a higher chance the estimate might be outside the desired range. A delta of 0.1 is a good starting point for most applications").index(3).required(true).value_parser(value_parser!(f64)))
2828+ .arg(arg!(-s --streamsize <STREAM_SIZE> "This is used to determine buffer size and can be a loose approximation. The closer it is to the stream size, the more accurate the results").index(4).required(true).value_parser(value_parser!(usize)))
2929+ .get_matches();
3030+ let input_file = params.get_one::<PathBuf>("tokens").unwrap();
3131+ let epsilon = params.get_one::<f64>("epsilon").unwrap();
3232+ let delta = params.get_one::<f64>("delta").unwrap();
3333+ let stream_size = params.get_one::<usize>("streamsize").unwrap();
3434+3535+ let mut counter = CVM::new(*epsilon, *delta, *stream_size);
3636+ let br = open_file(input_file);
3737+ for line in br.lines() {
3838+ counter.process_line_tokens(line.unwrap())
3939+ }
4040+ println!(
4141+ "Unique tokens: {:?}",
4242+ counter.calculate_final_result() as i32
4343+ );
4444+}