···58)
59```
6061## Performance
6263`oxyroot` is intended to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`.
···58)
59```
6061+## Combining Multiple Files
62+63+You can efficiently read and concatenate a TTree from multiple ROOT files into a single Polars DataFrame using `concat_trees`. This function processes files in parallel to maximize performance.
64+65+```python
66+import oxyroot
67+68+# Combine trees from multiple files using a wildcard
69+df = oxyroot.concat_trees(paths=["ntuples*.root"], tree_name="mu_mc")
70+71+print(df)
72+73+# You can also provide a list of specific files
74+# df = oxyroot.concat_trees(paths=["file1.root", "file2.root"], tree_name="my_tree")
75+76+# Control the number of threads used for parallel processing
77+# By default, oxyroot uses half of the available CPU cores.
78+oxyroot.set_num_threads(4)
79+```
80+81## Performance
8283`oxyroot` is intended to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`.
···1from typing import Iterator, List, Optional
2import numpy as np
3+import polars as pl
45class RootFile:
6 path: str
···14 def branches(self) -> List[str]: ...
15 def __getitem__(self, name: str) -> Branch: ...
16 def __iter__(self) -> Iterator[Branch]: ...
17+ def arrays(self, columns: Optional[List[str]] = None, ignore_columns: Optional[List[str]] = None) -> pl.DataFrame: ...
18 def to_parquet(self, output_file: str, overwrite: bool = False, compression: str = "snappy", columns: Optional[List[str]] = None) -> None: ...
1920class Branch:
···42 A RootFile object.
43 """
44 ...
45+46+def concat_trees(
47+ paths: List[str],
48+ tree_name: str,
49+ columns: Optional[List[str]] = None,
50+ ignore_columns: Optional[List[str]] = None,
51+) -> pl.DataFrame:
52+ """
53+ Reads multiple ROOT files, concatenates the specified tree, and returns a single Polars DataFrame.
54+55+ Args:
56+ paths: A list of paths to the ROOT files. Wildcards are supported.
57+ tree_name: The name of the tree to read from each file.
58+ columns: An optional list of column names to include. If None, all columns are included.
59+ ignore_columns: An optional list of column names to exclude.
60+61+ Returns:
62+ A single Polars DataFrame containing the concatenated data.
63+ """
64+ ...
65+66+def set_num_threads(num_threads: int) -> None:
67+ """
68+ Sets the number of threads to use for parallel operations.
69+70+ Args:
71+ num_threads: The number of threads to use.
72+ """
73+ ...
+144-59
src/lib.rs
···11};
12use arrow::datatypes::{DataType, Field, Schema};
13use arrow::record_batch::RecordBatch;
0014use parquet::arrow::ArrowWriter;
15use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
16use parquet::file::properties::WriterProperties;
17use polars::prelude::*;
018use pyo3_polars::PyDataFrame;
000000000000000000001920#[pyclass(name = "RootFile")]
21struct PyRootFile {
···41 name: String,
42}
43000000000000000000000000000000000000000000000000000000000000000000044#[pymethods]
45impl PyRootFile {
46 #[new]
···96 )
97 }
9899- #[pyo3(signature = (columns = None))]
100- fn arrays(&self, columns: Option<Vec<String>>) -> PyResult<PyDataFrame> {
0000101 let mut file =
102 RootFile::open(&self.path).map_err(|e| PyValueError::new_err(e.to_string()))?;
103 let tree = file
104 .get_tree(&self.name)
105 .map_err(|e| PyValueError::new_err(e.to_string()))?;
106-107- let branches_to_save = if let Some(columns) = columns {
108- columns
109- } else {
110- tree.branches().map(|b| b.name().to_string()).collect()
111- };
112-113- let mut series_vec = Vec::new();
114-115- for branch_name in branches_to_save {
116- let branch = match tree.branch(&branch_name) {
117- Some(branch) => branch,
118- None => {
119- println!("Branch '{}' not found, skipping", branch_name);
120- continue;
121- }
122- };
123-124- let series = match branch.item_type_name().as_str() {
125- "float" => {
126- let data = branch.as_iter::<f32>().unwrap().collect::<Vec<_>>();
127- Series::new((&branch_name).into(), data)
128- }
129- "double" => {
130- let data = branch.as_iter::<f64>().unwrap().collect::<Vec<_>>();
131- Series::new((&branch_name).into(), data)
132- }
133- "int32_t" => {
134- let data = branch.as_iter::<i32>().unwrap().collect::<Vec<_>>();
135- Series::new((&branch_name).into(), data)
136- }
137- "int64_t" => {
138- let data = branch.as_iter::<i64>().unwrap().collect::<Vec<_>>();
139- Series::new((&branch_name).into(), data)
140- }
141- "uint32_t" => {
142- let data = branch.as_iter::<u32>().unwrap().collect::<Vec<_>>();
143- Series::new((&branch_name).into(), data)
144- }
145- "uint64_t" => {
146- let data = branch.as_iter::<u64>().unwrap().collect::<Vec<_>>();
147- Series::new((&branch_name).into(), data)
148- }
149- "string" => {
150- let data = branch.as_iter::<String>().unwrap().collect::<Vec<_>>();
151- Series::new((&branch_name).into(), data)
152- }
153- other => {
154- println!("Unsupported branch type: {}, skipping", other);
155- continue;
156- }
157- };
158- series_vec.push(series);
159- }
160-161- let df = DataFrame::new(series_vec.into_iter().map(|s| s.into()).collect())
162- .map_err(|e| PyValueError::new_err(e.to_string()))?;
163 Ok(PyDataFrame(df))
164 }
165···390 Ok(env!("CARGO_PKG_VERSION").to_string())
391}
392000000000000000000000000000000000000000000000393/// A Python module to read root files, implemented in Rust.
394#[pymodule]
395fn oxyroot(m: &Bound<'_, PyModule>) -> PyResult<()> {
396 m.add_function(wrap_pyfunction!(version, m)?)?;
397 m.add_function(wrap_pyfunction!(open, m)?)?;
00398 m.add_class::<PyRootFile>()?;
399 m.add_class::<PyTree>()?;
400 m.add_class::<PyBranch>()?;