···5858)
5959```
60606161+## Combining Multiple Files
6262+6363+You can efficiently read and concatenate a TTree from multiple ROOT files into a single Polars DataFrame using `concat_trees`. This function processes files in parallel to maximize performance.
6464+6565+```python
6666+import oxyroot
6767+6868+# Combine trees from multiple files using a wildcard
6969+df = oxyroot.concat_trees(paths=["ntuples*.root"], tree_name="mu_mc")
7070+7171+print(df)
7272+7373+# You can also provide a list of specific files
7474+# df = oxyroot.concat_trees(paths=["file1.root", "file2.root"], tree_name="my_tree")
7575+7676+# Control the number of threads used for parallel processing
7777+# By default, it uses half the available CPU cores.
7878+oxyroot.set_num_threads(4)
7979+```
8080+6181## Performance
62826383`oxyroot` is intended to be fast. Here is a simple benchmark comparing the time taken to read all branches of a TTree with `uproot` and `oxyroot`.
+31
python/oxyroot/__init__.pyi
···11from typing import Iterator, List, Optional
22import numpy as np
33+import polars as pl
3445class RootFile:
56 path: str
···1314 def branches(self) -> List[str]: ...
1415 def __getitem__(self, name: str) -> Branch: ...
1516 def __iter__(self) -> Iterator[Branch]: ...
1717+ def arrays(self, columns: Optional[List[str]] = None, ignore_columns: Optional[List[str]] = None) -> pl.DataFrame: ...
1618 def to_parquet(self, output_file: str, overwrite: bool = False, compression: str = "snappy", columns: Optional[List[str]] = None) -> None: ...
17191820class Branch:
···4042 A RootFile object.
4143 """
4244 ...
def concat_trees(
    paths: List[str],
    tree_name: str,
    columns: Optional[List[str]] = None,
    ignore_columns: Optional[List[str]] = None,
) -> pl.DataFrame:
    """
    Read a tree from several ROOT files and concatenate the results.

    Args:
        paths: Paths to the ROOT files; glob wildcards are supported.
        tree_name: Name of the tree to read from each file.
        columns: Optional list of columns to keep. All columns when None.
        ignore_columns: Optional list of columns to drop.

    Returns:
        One Polars DataFrame holding the concatenated data.
    """
    ...
def set_num_threads(num_threads: int) -> None:
    """
    Set the thread count used for parallel operations.

    Args:
        num_threads: Number of worker threads to use.
    """
    ...
+144-59
src/lib.rs
···1111};
1212use arrow::datatypes::{DataType, Field, Schema};
1313use arrow::record_batch::RecordBatch;
1414+use once_cell::sync::Lazy;
1515+use parking_lot::Mutex;
1416use parquet::arrow::ArrowWriter;
1517use parquet::basic::{BrotliLevel, Compression, GzipLevel, ZstdLevel};
1618use parquet::file::properties::WriterProperties;
1719use polars::prelude::*;
2020+use polars_core::utils::concat_df;
1821use pyo3_polars::PyDataFrame;
2222+use rayon::prelude::*;
2323+2424+static POOL: Lazy<Mutex<rayon::ThreadPool>> = Lazy::new(|| {
2525+ let num_threads = std::cmp::max(1, num_cpus::get() / 2);
2626+ let pool = rayon::ThreadPoolBuilder::new()
2727+ .num_threads(num_threads)
2828+ .build()
2929+ .unwrap();
3030+ Mutex::new(pool)
3131+});
3232+3333+#[pyfunction]
3434+fn set_num_threads(num_threads: usize) -> PyResult<()> {
3535+ let pool = rayon::ThreadPoolBuilder::new()
3636+ .num_threads(num_threads)
3737+ .build()
3838+ .map_err(|e| PyValueError::new_err(e.to_string()))?;
3939+ *POOL.lock() = pool;
4040+ Ok(())
4141+}
19422043#[pyclass(name = "RootFile")]
2144struct PyRootFile {
···4164 name: String,
4265}
43666767+fn tree_to_dataframe(
6868+ tree: &::oxyroot::ReaderTree,
6969+ columns: Option<Vec<String>>,
7070+ ignore_columns: Option<Vec<String>>,
7171+) -> PyResult<DataFrame> {
7272+ let mut branches_to_save = if let Some(columns) = columns {
7373+ columns
7474+ } else {
7575+ tree.branches().map(|b| b.name().to_string()).collect()
7676+ };
7777+7878+ if let Some(ignore_columns) = ignore_columns {
7979+ branches_to_save.retain(|c| !ignore_columns.contains(c));
8080+ }
8181+8282+ let mut series_vec = Vec::new();
8383+8484+ for branch_name in branches_to_save {
8585+ let branch = match tree.branch(&branch_name) {
8686+ Some(branch) => branch,
8787+ None => {
8888+ println!("Branch '{}' not found, skipping", branch_name);
8989+ continue;
9090+ }
9191+ };
9292+9393+ let series = match branch.item_type_name().as_str() {
9494+ "float" => {
9595+ let data = branch.as_iter::<f32>().unwrap().collect::<Vec<_>>();
9696+ Series::new((&branch_name).into(), data)
9797+ }
9898+ "double" => {
9999+ let data = branch.as_iter::<f64>().unwrap().collect::<Vec<_>>();
100100+ Series::new((&branch_name).into(), data)
101101+ }
102102+ "int32_t" => {
103103+ let data = branch.as_iter::<i32>().unwrap().collect::<Vec<_>>();
104104+ Series::new((&branch_name).into(), data)
105105+ }
106106+ "int64_t" => {
107107+ let data = branch.as_iter::<i64>().unwrap().collect::<Vec<_>>();
108108+ Series::new((&branch_name).into(), data)
109109+ }
110110+ "uint32_t" => {
111111+ let data = branch.as_iter::<u32>().unwrap().collect::<Vec<_>>();
112112+ Series::new((&branch_name).into(), data)
113113+ }
114114+ "uint64_t" => {
115115+ let data = branch.as_iter::<u64>().unwrap().collect::<Vec<_>>();
116116+ Series::new((&branch_name).into(), data)
117117+ }
118118+ "string" => {
119119+ let data = branch.as_iter::<String>().unwrap().collect::<Vec<_>>();
120120+ Series::new((&branch_name).into(), data)
121121+ }
122122+ other => {
123123+ println!("Unsupported branch type: {}, skipping", other);
124124+ continue;
125125+ }
126126+ };
127127+ series_vec.push(series);
128128+ }
129129+130130+ DataFrame::new(series_vec.into_iter().map(|s| s.into()).collect())
131131+ .map_err(|e| PyValueError::new_err(e.to_string()))
132132+}
133133+44134#[pymethods]
45135impl PyRootFile {
46136 #[new]
···96186 )
97187 }
981889999- #[pyo3(signature = (columns = None))]
100100- fn arrays(&self, columns: Option<Vec<String>>) -> PyResult<PyDataFrame> {
189189+ #[pyo3(signature = (columns = None, ignore_columns = None))]
190190+ fn arrays(
191191+ &self,
192192+ columns: Option<Vec<String>>,
193193+ ignore_columns: Option<Vec<String>>,
194194+ ) -> PyResult<PyDataFrame> {
101195 let mut file =
102196 RootFile::open(&self.path).map_err(|e| PyValueError::new_err(e.to_string()))?;
103197 let tree = file
104198 .get_tree(&self.name)
105199 .map_err(|e| PyValueError::new_err(e.to_string()))?;
106106-107107- let branches_to_save = if let Some(columns) = columns {
108108- columns
109109- } else {
110110- tree.branches().map(|b| b.name().to_string()).collect()
111111- };
112112-113113- let mut series_vec = Vec::new();
114114-115115- for branch_name in branches_to_save {
116116- let branch = match tree.branch(&branch_name) {
117117- Some(branch) => branch,
118118- None => {
119119- println!("Branch '{}' not found, skipping", branch_name);
120120- continue;
121121- }
122122- };
123123-124124- let series = match branch.item_type_name().as_str() {
125125- "float" => {
126126- let data = branch.as_iter::<f32>().unwrap().collect::<Vec<_>>();
127127- Series::new((&branch_name).into(), data)
128128- }
129129- "double" => {
130130- let data = branch.as_iter::<f64>().unwrap().collect::<Vec<_>>();
131131- Series::new((&branch_name).into(), data)
132132- }
133133- "int32_t" => {
134134- let data = branch.as_iter::<i32>().unwrap().collect::<Vec<_>>();
135135- Series::new((&branch_name).into(), data)
136136- }
137137- "int64_t" => {
138138- let data = branch.as_iter::<i64>().unwrap().collect::<Vec<_>>();
139139- Series::new((&branch_name).into(), data)
140140- }
141141- "uint32_t" => {
142142- let data = branch.as_iter::<u32>().unwrap().collect::<Vec<_>>();
143143- Series::new((&branch_name).into(), data)
144144- }
145145- "uint64_t" => {
146146- let data = branch.as_iter::<u64>().unwrap().collect::<Vec<_>>();
147147- Series::new((&branch_name).into(), data)
148148- }
149149- "string" => {
150150- let data = branch.as_iter::<String>().unwrap().collect::<Vec<_>>();
151151- Series::new((&branch_name).into(), data)
152152- }
153153- other => {
154154- println!("Unsupported branch type: {}, skipping", other);
155155- continue;
156156- }
157157- };
158158- series_vec.push(series);
159159- }
160160-161161- let df = DataFrame::new(series_vec.into_iter().map(|s| s.into()).collect())
162162- .map_err(|e| PyValueError::new_err(e.to_string()))?;
200200+ let df = tree_to_dataframe(&tree, columns, ignore_columns)?;
163201 Ok(PyDataFrame(df))
164202 }
165203···390428 Ok(env!("CARGO_PKG_VERSION").to_string())
391429}
392430431431+#[pyfunction]
432432+#[pyo3(signature = (paths, tree_name, columns = None, ignore_columns = None))]
433433+fn concat_trees(
434434+ paths: Vec<String>,
435435+ tree_name: String,
436436+ columns: Option<Vec<String>>,
437437+ ignore_columns: Option<Vec<String>>,
438438+) -> PyResult<PyDataFrame> {
439439+ let mut all_paths = Vec::new();
440440+ for path in paths {
441441+ for entry in glob::glob(&path).map_err(|e| PyValueError::new_err(e.to_string()))? {
442442+ match entry {
443443+ Ok(path) => {
444444+ all_paths.push(path.to_str().unwrap().to_string());
445445+ }
446446+ Err(e) => return Err(PyValueError::new_err(e.to_string())),
447447+ }
448448+ }
449449+ }
450450+451451+ let pool = POOL.lock();
452452+ let dfs: Vec<DataFrame> = pool.install(|| {
453453+ all_paths
454454+ .par_iter()
455455+ .map(|path| {
456456+ let mut file =
457457+ RootFile::open(path).map_err(|e| PyValueError::new_err(e.to_string()))?;
458458+ let tree = file
459459+ .get_tree(&tree_name)
460460+ .map_err(|e| PyValueError::new_err(e.to_string()))?;
461461+ tree_to_dataframe(&tree, columns.clone(), ignore_columns.clone())
462462+ })
463463+ .filter_map(Result::ok)
464464+ .collect()
465465+ });
466466+467467+ if dfs.is_empty() {
468468+ return Ok(PyDataFrame(DataFrame::default()));
469469+ }
470470+471471+ let combined_df = concat_df(&dfs).map_err(|e| PyValueError::new_err(e.to_string()))?;
472472+473473+ Ok(PyDataFrame(combined_df))
474474+}
475475+393476/// A Python module to read root files, implemented in Rust.
394477#[pymodule]
395478fn oxyroot(m: &Bound<'_, PyModule>) -> PyResult<()> {
396479 m.add_function(wrap_pyfunction!(version, m)?)?;
397480 m.add_function(wrap_pyfunction!(open, m)?)?;
481481+ m.add_function(wrap_pyfunction!(concat_trees, m)?)?;
482482+ m.add_function(wrap_pyfunction!(set_num_threads, m)?)?;
398483 m.add_class::<PyRootFile>()?;
399484 m.add_class::<PyTree>()?;
400485 m.add_class::<PyBranch>()?;