Python bindings to oxyroot. Makes reading .root files blazing fast 🚀

arrays to return a polars dataframe

+81 -8
+4 -2
Cargo.toml
··· 10 10 11 11 [dependencies] 12 12 oxyroot = "0.1.25" 13 - pyo3 = { version = "0.26.0", features = ["abi3-py38"] } 13 + pyo3 = { version = "0.25.0", features = ["abi3-py38"] } 14 14 parking_lot = "0.12.3" 15 - numpy = "0.26.0" 15 + numpy = "0.25.0" 16 16 parquet = { version = "53.0.0", features = ["arrow"] } 17 17 arrow = "53.0.0" 18 + polars = "0.50.0" 19 + pyo3-polars = "0.23.1"
+8 -6
README.md
··· 8 8 9 9 A fast, Rust-powered Python reader for CERN ROOT files. 10 10 11 - This package provides a simple and Pythonic interface bindings to `oxyroot`, a rust package, to read data from `.root` files, inspired by libraries like `uproot`. It leverages the speed of Rust for high-performance data extraction and integrates with the scientific Python ecosystem by providing data as NumPy arrays. 11 + This Python package provides simple bindings to [`oxyroot`, a Rust package](https://github.com/m-dupont/oxyroot), to read data from `.root` files, inspired by libraries like `uproot`. It leverages the speed of Rust and integrates with the scientific Python ecosystem by providing data as NumPy arrays or Polars DataFrames. 12 12 13 13 ## Features 14 14 15 - - **High-Performance**: Core logic is written in Rust for maximum speed. 16 - - **Parquet Conversion**: Convert TTrees directly to Apache Parquet files with a single command. 17 - - **NumPy Integration**: Get branch data directly as NumPy arrays. 18 - - **Simple, Pythonic API**: Easy to learn and use, and similar to `uproot` 15 + - Simple API similar to `uproot` 16 + - Core logic is written in Rust. 17 + - Get branch data directly as NumPy arrays or Polars DataFrames. 18 + - Parquet Conversion: Convert TTrees directly to Apache Parquet files with a single command. 19 19 20 20 ## Quick Start 21 21 ··· 60 60 61 61 ## Performance 62 62 63 - `oxyroot` is designed to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`. 63 + `oxyroot` is intended to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`. 64 64 65 65 ```python 66 66 import oxyroot ··· 87 87 end_time = time.time() 88 88 print(f"Oxyroot took: {end_time - start_time:.3f}s") 89 89 ``` 90 + 91 + On a small file (~20 MB) with multiple data formats, oxyroot took half the time of uproot, and also read in the branch with strings! 
90 92 91 93 ## License 92 94
+69
src/lib.rs
··· 14 14 use parquet::arrow::ArrowWriter; 15 15 use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel}; 16 16 use parquet::file::properties::WriterProperties; 17 + use polars::prelude::*; 18 + use pyo3_polars::PyDataFrame; 17 19 18 20 #[pyclass(name = "RootFile")] 19 21 struct PyRootFile { ··· 92 94 branches: branches.into_iter(), 93 95 }, 94 96 ) 97 + } 98 + 99 + #[pyo3(signature = (columns = None))] 100 + fn arrays(&self, columns: Option<Vec<String>>) -> PyResult<PyDataFrame> { 101 + let mut file = 102 + RootFile::open(&self.path).map_err(|e| PyValueError::new_err(e.to_string()))?; 103 + let tree = file 104 + .get_tree(&self.name) 105 + .map_err(|e| PyValueError::new_err(e.to_string()))?; 106 + 107 + let branches_to_save = if let Some(columns) = columns { 108 + columns 109 + } else { 110 + tree.branches().map(|b| b.name().to_string()).collect() 111 + }; 112 + 113 + let mut series_vec = Vec::new(); 114 + 115 + for branch_name in branches_to_save { 116 + let branch = match tree.branch(&branch_name) { 117 + Some(branch) => branch, 118 + None => { 119 + println!("Branch '{}' not found, skipping", branch_name); 120 + continue; 121 + } 122 + }; 123 + 124 + let series = match branch.item_type_name().as_str() { 125 + "float" => { 126 + let data = branch.as_iter::<f32>().unwrap().collect::<Vec<_>>(); 127 + Series::new((&branch_name).into(), data) 128 + } 129 + "double" => { 130 + let data = branch.as_iter::<f64>().unwrap().collect::<Vec<_>>(); 131 + Series::new((&branch_name).into(), data) 132 + } 133 + "int32_t" => { 134 + let data = branch.as_iter::<i32>().unwrap().collect::<Vec<_>>(); 135 + Series::new((&branch_name).into(), data) 136 + } 137 + "int64_t" => { 138 + let data = branch.as_iter::<i64>().unwrap().collect::<Vec<_>>(); 139 + Series::new((&branch_name).into(), data) 140 + } 141 + "uint32_t" => { 142 + let data = branch.as_iter::<u32>().unwrap().collect::<Vec<_>>(); 143 + Series::new((&branch_name).into(), data) 144 + } 145 + "uint64_t" => { 146 
+ let data = branch.as_iter::<u64>().unwrap().collect::<Vec<_>>(); 147 + Series::new((&branch_name).into(), data) 148 + } 149 + "string" => { 150 + let data = branch.as_iter::<String>().unwrap().collect::<Vec<_>>(); 151 + Series::new((&branch_name).into(), data) 152 + } 153 + other => { 154 + println!("Unsupported branch type: {}, skipping", other); 155 + continue; 156 + } 157 + }; 158 + series_vec.push(series); 159 + } 160 + 161 + let df = DataFrame::new(series_vec.into_iter().map(|s| s.into()).collect()) 162 + .map_err(|e| PyValueError::new_err(e.to_string()))?; 163 + Ok(PyDataFrame(df)) 95 164 } 96 165 97 166 #[pyo3(signature = (output_file, overwrite = false, compression = "snappy", columns = None))]