From fa838d5aaf63f7485f96b80da023b2eebbdbec73 Mon Sep 17 00:00:00 2001 From: Bibek Bhattarai Date: Tue, 12 May 2026 23:57:36 +0000 Subject: [PATCH 1/5] Added the material for XGBoost optimization --- README.md | 1 + software/xgboost/README.md | 332 +++++++++++++++++++++++++++++++++++++ 2 files changed, 333 insertions(+) create mode 100644 software/xgboost/README.md diff --git a/README.md b/README.md index bffdb79..e03c4ca 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,7 @@ We aim to provide a dynamic resource where users can find the latest optimizatio - [Similarity Search](software/similarity-search/README.md) - [Redis](software/similarity-search/redis/README.md) - [Spark](software/spark/README.md) + - [XGBoost](software/xgboost/README.md) - [MySQL & PostgreSQL](software/mysql-postgresql/README.md) - [Kafka](software/kafka/README.md) - [TensorFlow](software/tensorflow/) diff --git a/software/xgboost/README.md b/software/xgboost/README.md new file mode 100644 index 0000000..9e6f8ba --- /dev/null +++ b/software/xgboost/README.md @@ -0,0 +1,332 @@ +# XGBoost Optimization on Intel® Processors + +## Introduction + +[XGBoost](https://xgboost.readthedocs.io/) is one of the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. This guide covers techniques to significantly accelerate XGBoost inference on Intel® Xeon® processors using Intel® oneAPI Data Analytics Library (oneDAL) via its Python interface, `daal4py`. + +By converting trained XGBoost models to oneDAL, you can achieve **up to 36x faster inference** with no loss in prediction quality and minimal code changes. oneDAL leverages Intel® Advanced Vector Extensions 512 (AVX-512) and optimized memory access patterns to maximize performance on Intel hardware. 
+ +## Contents + +- [References](#references) +- [Prerequisites](#prerequisites) +- [Installation](#installation) +- [Accelerating XGBoost Inference with oneDAL](#accelerating-xgboost-inference-with-onedal) + - [Convert and Predict (Simplified API)](#convert-and-predict-simplified-api) + - [Classification Example](#classification-example) + - [Regression Example](#regression-example) + - [Getting Prediction Probabilities](#getting-prediction-probabilities) + - [Saving and Loading Converted Models](#saving-and-loading-converted-models) +- [Performance Results](#performance-results) +- [How It Works](#how-it-works) +- [Configuration Recommendations](#configuration-recommendations) + - [Scaling Inference on Multi-Socket Systems](#scaling-inference-on-multi-socket-systems) + +## References + +- [Faster XGBoost, LightGBM, and CatBoost Inference on the CPU (Intel Developer)](https://www.intel.com/content/www/us/en/developer/articles/technical/faster-xgboost-light-gbm-catboost-inference-on-cpu.html) +- [Improving the Performance of XGBoost and LightGBM Inference (Intel Analytics Software)](https://medium.com/intel-analytics-software/improving-the-performance-of-xgboost-and-lightgbm-inference-3b542c03447e) +- [Fast Gradient Boosting Tree Inference for Intel Xeon Processors (Intel Analytics Software)](https://medium.com/intel-analytics-software/fast-gradient-boosting-tree-inference-for-intel-xeon-processors-35756f174f55) +- [daal4py Model Builders Documentation](https://intelpython.github.io/daal4py/model-builders.html) +- [oneDAL GitHub Repository](https://github.com/oneapi-src/oneDAL) +- [Intel Extension for Scikit-learn (sklearnex)](https://github.com/intel/scikit-learn-intelex) + +## Prerequisites + +- Intel® Xeon® Scalable Processor (2nd Generation or newer recommended for AVX-512 support) +- Python 3.9 or higher +- XGBoost installed (`xgboost` package) + +## Installation + +Install `daal4py` from PyPI: + +```bash +pip install daal4py +``` + +Or from conda-forge: + 
+```bash +conda install -c conda-forge daal4py --override-channels +``` + +## Accelerating XGBoost Inference with oneDAL + +The core optimization is straightforward: train your model with XGBoost as usual, then convert it to a oneDAL model for faster inference. No changes to your training code are required. + +### Convert and Predict (Simplified API) + +The simplest approach uses the `d4p.mb.convert_model()` API: + +```python +import xgboost as xgb +import daal4py as d4p + +# Train your XGBoost model as usual +clf = xgb.XGBClassifier(**params) +clf.fit(X_train, y_train) + +# Convert to oneDAL model (one line) +d4p_model = d4p.mb.convert_model(clf) + +# Run inference with oneDAL acceleration +predictions = d4p_model.predict(X_test) +``` + +This same API also works with LightGBM and CatBoost models: + +```python +# LightGBM +d4p_model = d4p.mb.convert_model(lgb_model) + +# CatBoost +d4p_model = d4p.mb.convert_model(cb_model) +``` + +### Classification Example + +```python +import numpy as np +import xgboost as xgb +import daal4py as d4p +from sklearn.datasets import make_classification +from sklearn.model_selection import train_test_split + +# Generate sample data +X, y = make_classification(n_samples=10000, n_features=50, random_state=42) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + +# Train with XGBoost +params = { + "n_estimators": 100, + "max_depth": 8, + "learning_rate": 0.1, + "objective": "binary:logistic", + "eval_metric": "logloss", +} +clf = xgb.XGBClassifier(**params) +clf.fit(X_train, y_train) + +# Convert to oneDAL for faster inference +d4p_model = d4p.mb.convert_model(clf) + +# Predict with oneDAL acceleration +d4p_predictions = d4p_model.predict(X_test) +``` + +### Regression Example + +```python +import xgboost as xgb +import daal4py as d4p +from sklearn.datasets import make_regression +from sklearn.model_selection import train_test_split + +# Generate sample data +X, y = make_regression(n_samples=10000, n_features=50, 
random_state=42) +X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) + +# Train with XGBoost +reg = xgb.XGBRegressor(n_estimators=100, max_depth=8, learning_rate=0.1) +reg.fit(X_train, y_train) + +# Convert and predict with oneDAL +d4p_model = d4p.mb.convert_model(reg) +d4p_predictions = d4p_model.predict(X_test) +``` + +### Getting Prediction Probabilities + +For classification tasks, you can request both labels and probabilities: + +```python +import daal4py as d4p + +# Using the lower-level API for more control +daal_model = d4p.get_gbt_model_from_xgboost(clf.get_booster()) + +predict_algo = d4p.gbt_classification_prediction( + nClasses=n_classes, + resultsToEvaluate="computeClassLabels|computeClassProbabilities" +) +daal_prediction = predict_algo.compute(X_test, daal_model) + +# Access results +labels = daal_prediction.prediction +probabilities = daal_prediction.probabilities +``` + +### Saving and Loading Converted Models + +Converted oneDAL models can be serialized with `pickle` for deployment: + +```python +import pickle +import daal4py as d4p + +# Convert from XGBoost +d4p_model = d4p.mb.convert_model(xgb_model) + +# Save the converted model +with open("d4p_model.pkl", "wb") as f: + pickle.dump(d4p_model, f) + +# Load and predict (no XGBoost dependency needed at inference time) +with open("d4p_model.pkl", "rb") as f: + model = pickle.load(f) + +predictions = model.predict(X_test) +``` + +## Performance Results + +### daal4py (oneDAL) Inference Speedup over Native Libraries + +The following results were measured on an Intel® Xeon® Platinum 8592+ (Emerald Rapids), 2 sockets, 64 cores/socket, 256 threads, 503 GB RAM. Benchmarks were pinned to a single NUMA node (cores 0–31) using `numactl --localalloc --physcpubind=0-31`. Each model was trained with 100 estimators at max depth 8. Inference was measured over 100 iterations after warmup. Speedup = native library inference time / daal4py inference time. 
+ +| Dataset | Rows | Features | Task | daal4py vs XGBoost | daal4py vs LightGBM | daal4py vs CatBoost | +|:--------|-----:|---------:|:-----|-------------------:|--------------------:|--------------------:| +| Abalone | 4,177 | 8 | Regression | 2.66x | 3.53x | 6.12x | +| HIGGS-1M | 940,160 | 24 | Classification | 1.87x | 6.10x | 9.25x | +| MLSR | 203 | 12,600 | Regression | 8.02x | 2.51x | 25.91x | +| Mortgage-1Q | 500,000 | 45 | Regression | 1.24x | 1.66x | 5.27x | +| PLAsTiCC | 200,000 | 60 | Classification | 2.81x | 6.50x | 1.11x | +| Airline | 26,969 | 7 | Classification | 1.73x | 3.55x | 10.01x | + +**Software versions:** XGBoost 2.1.4, LightGBM 4.6.0, CatBoost 1.2.10, daal4py 2024.7, Python 3.10.12, scikit-learn 1.5.2 + +**Hardware:** Intel® Xeon® Platinum 8592+ (Emerald Rapids), 2S/64C/128T per socket, HT On, 503 GB DDR5, single NUMA node + +Across all datasets, daal4py consistently accelerates inference for all three gradient boosting frameworks. CatBoost sees the largest gains (up to 25.9x on MLSR), while LightGBM and XGBoost benefit most on larger datasets and higher-dimensional feature spaces. Prediction quality is preserved — match rates are 99.7–100% across all tests. 
+ +### Reproducing the Benchmark + +The core benchmarking loop measures native vs daal4py inference time after warmup: + +```python +import time +import numpy as np +import daal4py as d4p + +# model = trained XGBoost, LightGBM, or CatBoost model +# X_test = numpy float32 test array + +# Convert the model (one line, works for all three frameworks) +d4p_model = d4p.mb.convert_model(model) + +# Warmup +for _ in range(5): + model.predict(X_test) + d4p_model.predict(X_test) + +# Measure native inference +n_iter = 100 +native_times = [] +for _ in range(n_iter): + t0 = time.perf_counter() + model.predict(X_test) + native_times.append(time.perf_counter() - t0) + +# Measure daal4py inference +d4p_times = [] +for _ in range(n_iter): + t0 = time.perf_counter() + d4p_model.predict(X_test) + d4p_times.append(time.perf_counter() - t0) + +speedup = np.mean(native_times) / np.mean(d4p_times) +print(f"Speedup: {speedup:.2f}x") +``` + +*Performance varies by use, configuration, and other factors.* + +## How It Works + +oneDAL achieves faster GBT inference through two key optimizations: + +### AVX-512 Vectorized Tree Traversal +oneDAL uses Intel AVX-512 vector instructions (`vpgatherd` and `vcmpp`) to process multiple observations through decision trees simultaneously. Instead of traversing one observation at a time, it processes a block of rows through each tree in parallel using SIMD operations for node comparisons and index computations. + +### Cache-Optimized Memory Access +Tree structures are blocked in memory so that a subset of trees and a block of observations fit in the L1 data cache. This ensures the majority of memory accesses are served from L1 cache at maximum bandwidth, rather than incurring costly main memory accesses. 
+ +## Configuration Recommendations + +| Setting | Recommendation | +|:--------|:---------------| +| Data Format | Use NumPy contiguous arrays (`np.ascontiguousarray()`) as input for best performance | +| Data Type | Use `float32` for maximum throughput; `float64` is also supported | +| Batch Size | oneDAL excels at all batch sizes, with the largest advantage at batch size = 1 (online inference) | +| NUMA | For multi-socket systems, pin processes to a single NUMA node to minimize cross-socket memory access | +| daal4py Version | Use the latest version for CatBoost support, missing values support, and performance improvements | + +### Scaling Inference on Multi-Socket Systems + +On multi-socket Intel Xeon systems, there are two key decisions that significantly impact daal4py inference performance: **how to scale across NUMA nodes** and **whether to use hyperthreads**. + +#### Thread Scaling vs. Process Scaling + +A single daal4py process uses internal threading (TBB/OpenMP) to parallelize across available cores. Alternatively, you can run multiple independent OS-level processes, each pinned to a separate NUMA node with its own copy of the model and data. These approaches offer different tradeoffs. 
+ +Testing on a 4-NUMA-node Intel Xeon Platinum 8592+ (200K rows, 24 features, 100 trees, `numactl --localalloc`) showed: + +| Configuration | Throughput (rows/s) | p50 Latency (us) | Scaling | +|:--------------|--------------------:|------------------:|:--------| +| **Thread scaling** (single process, daal internal threading) | | | | +| 1 NUMA node (32 cores) | ~15–17M | ~2,300 | 1.0x | +| 1 socket (64 cores) | ~20M | ~1,500 | 1.3x | +| 2 sockets (128 cores) | ~32M | ~1,230 | 2.1x | +| **Process scaling** (separate NUMA-pinned OS processes) | | | | +| 1 process (32 cores) | ~18M | ~2,280 | 1.0x | +| 2 processes, 1 per NUMA node (64 cores) | ~38M | ~2,040 | 2.1x | +| 4 processes, 1 per NUMA node (128 cores) | ~73M | ~2,090 | 4.1x | + +Key observations: +- **Process scaling is nearly linear** — 4 NUMA-pinned processes achieve **4.1x** the throughput of a single process. Each worker has its own model, data, and local memory, with zero cross-NUMA traffic. +- **Thread scaling is sub-linear** — using 4x the cores in a single process yields only **2.1x** throughput, because cross-socket memory coherency traffic limits scaling. +- **The tradeoff is latency**: thread scaling achieves **lower per-request latency** (1,230 us at 128 cores) because all cores collaborate on each prediction. Process scaling maintains a fixed latency (~2,000 us per worker, 32 cores each) but delivers **higher aggregate throughput**. + +#### Hyperthreading Hurts Performance + +daal4py's AVX-512 vectorized tree traversal is backend-bound — whether the bottleneck is core execution units or memory bandwidth, adding hyperthreads increases resource contention on the shared physical core, harming performance. 
+ +| Configuration (1 NUMA node) | Throughput (rows/s) | p50 Latency (us) | +|:-----------------------------|--------------------:|------------------:| +| 32 physical cores only (`--physcpubind=0-31`) | ~18M | ~2,000 | +| 64 threads with HT (`--physcpubind=0-31,128-159` or `--cpunodebind=0`) | ~8.5M | ~4,760 | + +Enabling hyperthreads **halves throughput and doubles latency**, regardless of whether you use `--cpunodebind` or `--physcpubind` to specify them. The penalty comes from HT siblings competing for the same AVX-512 execution units and cache lines that daal4py relies on. + +#### Recommendations + +**For latency-sensitive inference** (single request at a time), use thread scaling with all physical cores: + +```bash +# Use all 128 physical cores across both sockets for lowest per-request latency +numactl --localalloc --physcpubind=0-127 python my_inference.py +``` + +**For throughput-oriented serving** (batch processing or concurrent clients), run one process per NUMA node, each pinned to physical cores only: + +```bash +# 4 NUMA-pinned workers for maximum aggregate throughput +numactl --localalloc --physcpubind=0-31 python my_inference.py --shard=0 & +numactl --localalloc --physcpubind=32-63 python my_inference.py --shard=1 & +numactl --localalloc --physcpubind=64-95 python my_inference.py --shard=2 & +numactl --localalloc --physcpubind=96-127 python my_inference.py --shard=3 & +``` + +**Always pin to physical cores** — use `--physcpubind` with physical core IDs, not `--cpunodebind` which includes hyperthread siblings. On systems where HT cannot be disabled in BIOS, explicit `--physcpubind` ranges are essential. + +#### Memory Allocator + +Alternative memory allocators such as jemalloc or tcmalloc can sometimes improve performance over the default glibc malloc. 
It is recommended to test with these enabled to see if either provides a benefit for your workload: + +```bash +# jemalloc +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 python my_inference.py + +# tcmalloc +LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 python my_inference.py +``` From 6bd1b6b19c1442d421994284a8f7b64907e90754 Mon Sep 17 00:00:00 2001 From: Bibek Bhattarai Date: Tue, 19 May 2026 21:05:40 +0000 Subject: [PATCH 2/5] Fixed most of the comments on PR except the result data --- software/xgboost/README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/software/xgboost/README.md b/software/xgboost/README.md index 9e6f8ba..0714f87 100644 --- a/software/xgboost/README.md +++ b/software/xgboost/README.md @@ -2,7 +2,7 @@ ## Introduction -[XGBoost](https://xgboost.readthedocs.io/) is one of the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. This guide covers techniques to significantly accelerate XGBoost inference on Intel® Xeon® processors using Intel® oneAPI Data Analytics Library (oneDAL) via its Python interface, `daal4py`. +[XGBoost](https://xgboost.readthedocs.io/) is one of the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. This guide covers techniques to significantly accelerate XGBoost inference on Intel® Xeon® processors using [Intel® oneAPI Data Analytics Library (oneDAL)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onedal.html) via its Python interface, `daal4py`. By converting trained XGBoost models to oneDAL, you can achieve **up to 36x faster inference** with no loss in prediction quality and minimal code changes. oneDAL leverages Intel® Advanced Vector Extensions 512 (AVX-512) and optimized memory access patterns to maximize performance on Intel hardware. 
@@ -28,7 +28,7 @@ By converting trained XGBoost models to oneDAL, you can achieve **up to 36x fast - [Improving the Performance of XGBoost and LightGBM Inference (Intel Analytics Software)](https://medium.com/intel-analytics-software/improving-the-performance-of-xgboost-and-lightgbm-inference-3b542c03447e) - [Fast Gradient Boosting Tree Inference for Intel Xeon Processors (Intel Analytics Software)](https://medium.com/intel-analytics-software/fast-gradient-boosting-tree-inference-for-intel-xeon-processors-35756f174f55) - [daal4py Model Builders Documentation](https://intelpython.github.io/daal4py/model-builders.html) -- [oneDAL GitHub Repository](https://github.com/oneapi-src/oneDAL) +- [oneDAL GitHub Repository](https://github.com/uxlfoundation/oneDAL) - [Intel Extension for Scikit-learn (sklearnex)](https://github.com/intel/scikit-learn-intelex) ## Prerequisites @@ -196,7 +196,7 @@ The following results were measured on an Intel® Xeon® Platinum 8592+ (Emerald **Software versions:** XGBoost 2.1.4, LightGBM 4.6.0, CatBoost 1.2.10, daal4py 2024.7, Python 3.10.12, scikit-learn 1.5.2 -**Hardware:** Intel® Xeon® Platinum 8592+ (Emerald Rapids), 2S/64C/128T per socket, HT On, 503 GB DDR5, single NUMA node +**Hardware:** Intel® Xeon® Platinum 8592+ (Emerald Rapids), 2 sockets, 64 cores/socket, 256 threads, HT On, 503 GB DDR5, single NUMA node Across all datasets, daal4py consistently accelerates inference for all three gradient boosting frameworks. CatBoost sees the largest gains (up to 25.9x on MLSR), while LightGBM and XGBoost benefit most on larger datasets and higher-dimensional feature spaces. Prediction quality is preserved — match rates are 99.7–100% across all tests. 
@@ -212,7 +212,7 @@ import daal4py as d4p # model = trained XGBoost, LightGBM, or CatBoost model # X_test = numpy float32 test array -# Convert the model (one line, works for all three frameworks) +# Convert the model (works for XGBoost, LightGBM, and CatBoost) d4p_model = d4p.mb.convert_model(model) # Warmup @@ -257,9 +257,9 @@ Tree structures are blocked in memory so that a subset of trees and a block of o |:--------|:---------------| | Data Format | Use NumPy contiguous arrays (`np.ascontiguousarray()`) as input for best performance | | Data Type | Use `float32` for maximum throughput; `float64` is also supported | -| Batch Size | oneDAL excels at all batch sizes, with the largest advantage at batch size = 1 (online inference) | +| Batch Size | oneDAL performs well across batch sizes, with the largest advantage at batch size = 1 (online inference) | | NUMA | For multi-socket systems, pin processes to a single NUMA node to minimize cross-socket memory access | -| daal4py Version | Use the latest version for CatBoost support, missing values support, and performance improvements | +| daal4py Version | Use daal4py 2023.2 or newer (required for missing values support). Each release includes additional optimizations and bug fixes, so the latest version is recommended | ### Scaling Inference on Multi-Socket Systems @@ -287,9 +287,9 @@ Key observations: - **Thread scaling is sub-linear** — using 4x the cores in a single process yields only **2.1x** throughput, because cross-socket memory coherency traffic limits scaling. - **The tradeoff is latency**: thread scaling achieves **lower per-request latency** (1,230 us at 128 cores) because all cores collaborate on each prediction. Process scaling maintains a fixed latency (~2,000 us per worker, 32 cores each) but delivers **higher aggregate throughput**. 
-#### Hyperthreading Hurts Performance +#### Hyper-threading Hurts Performance -daal4py's AVX-512 vectorized tree traversal is backend-bound — whether the bottleneck is core execution units or memory bandwidth, adding hyperthreads increases resource contention on the shared physical core, harming performance. +daal4py's AVX-512 vectorized tree traversal is [backend-bound](https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html) — whether the bottleneck is core execution units or memory bandwidth, adding hyperthreads increases resource contention on the shared physical core, harming performance. | Configuration (1 NUMA node) | Throughput (rows/s) | p50 Latency (us) | |:-----------------------------|--------------------:|------------------:| From bf1fa05360f14106b0cefa12dd498e8be059faa0 Mon Sep 17 00:00:00 2001 From: Bibek Bhattarai Date: Mon, 29 Jun 2026 21:22:38 +0000 Subject: [PATCH 3/5] Updating the verified data, addressed review comments on using scikit-learn, removing memory allocator section, and clarifying the scope to include all 3 methods --- software/xgboost/README.md | 118 +++++++++++++++++++++---------------- 1 file changed, 68 insertions(+), 50 deletions(-) diff --git a/software/xgboost/README.md b/software/xgboost/README.md index 0714f87..f9cebf5 100644 --- a/software/xgboost/README.md +++ b/software/xgboost/README.md @@ -1,10 +1,12 @@ -# XGBoost Optimization on Intel® Processors +# Gradient Boosting Inference Optimization on Intel® Processors ## Introduction -[XGBoost](https://xgboost.readthedocs.io/) is one of the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. 
This guide covers techniques to significantly accelerate XGBoost inference on Intel® Xeon® processors using [Intel® oneAPI Data Analytics Library (oneDAL)](https://www.intel.com/content/www/us/en/developer/tools/oneapi/onedal.html) via its Python interface, `daal4py`. +[XGBoost](https://xgboost.readthedocs.io/), [LightGBM](https://lightgbm.readthedocs.io/), and [CatBoost](https://catboost.ai/) are among the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. This guide covers techniques to significantly accelerate inference for these frameworks on Intel® Xeon® processors using [oneDAL (oneAPI Data Analytics Library)](https://uxlfoundation.github.io/oneDAL/) via its Python interface, `daal4py`, provided through the [`scikit-learn-intelex`](https://github.com/intel/scikit-learn-intelex) package. -By converting trained XGBoost models to oneDAL, you can achieve **up to 36x faster inference** with no loss in prediction quality and minimal code changes. oneDAL leverages Intel® Advanced Vector Extensions 512 (AVX-512) and optimized memory access patterns to maximize performance on Intel hardware. +By converting trained models to oneDAL, you can achieve **up to 51x faster inference** with no loss in prediction quality and minimal code changes. oneDAL leverages Intel® Advanced Vector Extensions 512 (AVX-512) and optimized memory access patterns to maximize performance on Intel hardware. + +> **Note:** `daal4py` supports a specific subset of GBT model configurations (e.g., standard classification and regression trees). For model types not supported by daal4py, consider alternatives such as [ONNX Runtime](https://onnxruntime.ai/) for optimized inference. 
## Contents @@ -27,28 +29,35 @@ By converting trained XGBoost models to oneDAL, you can achieve **up to 36x fast - [Faster XGBoost, LightGBM, and CatBoost Inference on the CPU (Intel Developer)](https://www.intel.com/content/www/us/en/developer/articles/technical/faster-xgboost-light-gbm-catboost-inference-on-cpu.html) - [Improving the Performance of XGBoost and LightGBM Inference (Intel Analytics Software)](https://medium.com/intel-analytics-software/improving-the-performance-of-xgboost-and-lightgbm-inference-3b542c03447e) - [Fast Gradient Boosting Tree Inference for Intel Xeon Processors (Intel Analytics Software)](https://medium.com/intel-analytics-software/fast-gradient-boosting-tree-inference-for-intel-xeon-processors-35756f174f55) -- [daal4py Model Builders Documentation](https://intelpython.github.io/daal4py/model-builders.html) +- [scikit-learn-intelex Model Builders Documentation](https://uxlfoundation.github.io/scikit-learn-intelex/latest/model_builders.html) +- [About daal4py](https://uxlfoundation.github.io/scikit-learn-intelex/latest/about_daal4py.html) - [oneDAL GitHub Repository](https://github.com/uxlfoundation/oneDAL) -- [Intel Extension for Scikit-learn (sklearnex)](https://github.com/intel/scikit-learn-intelex) +- [scikit-learn-intelex (sklearnex)](https://github.com/intel/scikit-learn-intelex) ## Prerequisites - Intel® Xeon® Scalable Processor (2nd Generation or newer recommended for AVX-512 support) -- Python 3.9 or higher -- XGBoost installed (`xgboost` package) +- Python version supported by [scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex) (currently 3.10+) +- One or more gradient boosting libraries: [XGBoost](https://xgboost.readthedocs.io/) (`xgboost` from PyPI or `py-xgboost` from conda-forge), [LightGBM](https://lightgbm.readthedocs.io/) (`lightgbm`), [CatBoost](https://catboost.ai/) (`catboost`) ## Installation -Install `daal4py` from PyPI: +The `daal4py` module is provided through the `scikit-learn-intelex` package. 
Install from PyPI: ```bash -pip install daal4py +pip install scikit-learn-intelex ``` Or from conda-forge: ```bash -conda install -c conda-forge daal4py --override-channels +conda install -c conda-forge scikit-learn-intelex --override-channels +``` + +Install the gradient boosting libraries you need: + +```bash +pip install xgboost lightgbm catboost ``` ## Accelerating XGBoost Inference with oneDAL @@ -136,27 +145,26 @@ d4p_model = d4p.mb.convert_model(reg) d4p_predictions = d4p_model.predict(X_test) ``` + ### Getting Prediction Probabilities -For classification tasks, you can request both labels and probabilities: +For classification tasks, you can request both labels and probabilities using the high-level API: ```python import daal4py as d4p -# Using the lower-level API for more control -daal_model = d4p.get_gbt_model_from_xgboost(clf.get_booster()) +# Convert the model +d4p_model = d4p.mb.convert_model(clf) -predict_algo = d4p.gbt_classification_prediction( - nClasses=n_classes, - resultsToEvaluate="computeClassLabels|computeClassProbabilities" -) -daal_prediction = predict_algo.compute(X_test, daal_model) +# Get class labels +predictions = d4p_model.predict(X_test) -# Access results -labels = daal_prediction.prediction -probabilities = daal_prediction.probabilities +# Get prediction probabilities +probabilities = d4p_model.predict_proba(X_test) ``` +For full documentation on supported model types and options, see the [Model Builders documentation](https://uxlfoundation.github.io/scikit-learn-intelex/latest/model_builders.html). 
+ ### Saving and Loading Converted Models Converted oneDAL models can be serialized with `pickle` for deployment: @@ -181,24 +189,34 @@ predictions = model.predict(X_test) ## Performance Results -### daal4py (oneDAL) Inference Speedup over Native Libraries +### daal4py (oneDAL) Inference Speedup over Native Libraries (Batch Size = 1) -The following results were measured on an Intel® Xeon® Platinum 8592+ (Emerald Rapids), 2 sockets, 64 cores/socket, 256 threads, 503 GB RAM. Benchmarks were pinned to a single NUMA node (cores 0–31) using `numactl --localalloc --physcpubind=0-31`. Each model was trained with 100 estimators at max depth 8. Inference was measured over 100 iterations after warmup. Speedup = native library inference time / daal4py inference time. +The following results were measured on an AWS r8i.12xlarge instance (Intel® Xeon® Scalable Processor, Granite Rapids, 48 vCPUs, 384 GB RAM). Each model was trained with 1,000 estimators. Inference was measured at batch size = 1 (single-row prediction). Speedup = native library inference time / daal4py inference time. 
| Dataset | Rows | Features | Task | daal4py vs XGBoost | daal4py vs LightGBM | daal4py vs CatBoost | |:--------|-----:|---------:|:-----|-------------------:|--------------------:|--------------------:| -| Abalone | 4,177 | 8 | Regression | 2.66x | 3.53x | 6.12x | -| HIGGS-1M | 940,160 | 24 | Classification | 1.87x | 6.10x | 9.25x | -| MLSR | 203 | 12,600 | Regression | 8.02x | 2.51x | 25.91x | -| Mortgage-1Q | 500,000 | 45 | Regression | 1.24x | 1.66x | 5.27x | -| PLAsTiCC | 200,000 | 60 | Classification | 2.81x | 6.50x | 1.11x | -| Airline | 26,969 | 7 | Classification | 1.73x | 3.55x | 10.01x | +| Abalone | 4,177 | 8 | Regression | 12.56x | 10.06x | 4.91x | +| Airline | 26,969 | 6,452 | Classification (binary) | 11.27x | 13.01x | 1.85x | +| Airline-OHE | 940,160 | 24 | Classification (binary) | 5.32x | 51.03x | 46.86x | +| Bosch | 6,000,960 | 136 | Classification (binary) | 10.98x | 21.84x | 15.01x | +| Covtype | 500,000 | 45 | Classification (7-class) | 2.56x | 1.49x | 0.20x | +| Epsilon | 200,000 | 60 | Classification (binary) | 8.69x | 28.34x | 23.19x | +| Fraud | 76,020 | 370 | Classification (binary) | 15.78x | 41.55x | 3.58x | +| HIGGS | 26,969 | 7 | Classification (binary) | 10.82x | 13.53x | 2.36x | +| HIGGS-1M | 1,183,747 | 968 | Classification (binary) | 12.26x | 13.91x | 3.01x | +| MLSR | 581,012 | 54 | Regression | 13.67x | 11.61x | 5.73x | +| Mortgage-1Q | 500,000 | 2,000 | Regression | 13.05x | 8.91x | 4.09x | +| PLAsTiCC | 200,000 | 60 | Classification (14-class) | 2.42x | 1.07x | 0.11x | +| Santander | 940,160 | 24 | Classification (binary) | 11.07x | 17.22x | 7.42x | +| Year Prediction MSD | 515,345 | 90 | Regression | 11.59x | 10.46x | 4.56x | + +**Software versions used for benchmarking:** XGBoost 2.1.4, LightGBM 4.6.0, CatBoost 1.2.10, scikit-learn-intelex 2024.7, Python 3.10.12, scikit-learn 1.5.2. For best results, use the latest available versions of these packages. 
-**Software versions:** XGBoost 2.1.4, LightGBM 4.6.0, CatBoost 1.2.10, daal4py 2024.7, Python 3.10.12, scikit-learn 1.5.2 +**Hardware:** AWS r8i.12xlarge (Intel® Xeon® Scalable Processor, Granite Rapids, 48 vCPUs, 384 GB RAM) -**Hardware:** Intel® Xeon® Platinum 8592+ (Emerald Rapids), 2 sockets, 64 cores/socket, 256 threads, HT On, 503 GB DDR5, single NUMA node +Across most datasets, daal4py accelerates inference for all three gradient boosting frameworks. LightGBM sees the largest gains (up to 51x on Airline-OHE), XGBoost achieves a 2.4–16x speedup across all workloads, and CatBoost benefits most on high-dimensional binary classification tasks. The exceptions are the multiclass datasets (Covtype and PLAsTiCC), where the converted CatBoost models are slower than native CatBoost (0.20x and 0.11x). -Across all datasets, daal4py consistently accelerates inference for all three gradient boosting frameworks. CatBoost sees the largest gains (up to 25.9x on MLSR), while LightGBM and XGBoost benefit most on larger datasets and higher-dimensional feature spaces. Prediction quality is preserved — match rates are 99.7–100% across all tests. +For multiclass classification, default XGBoost, LightGBM, and daal4py all use one tree per class. CatBoost, on the other hand, uses vectorized trees. This means all other approaches end up processing `num_classes x` more trees compared to CatBoost, e.g., 7,000 vs 1,000 for Covtype. For smaller `num_estimators` like `100`, `daal4py` outperforms CatBoost, but as `num_estimators` gets larger, CatBoost provides better inference latency. ### Reproducing the Benchmark @@ -243,13 +261,21 @@ print(f"Speedup: {speedup:.2f}x") ## How It Works -oneDAL achieves faster GBT inference through two key optimizations: +The speedup from oneDAL comes from three primary factors: -### AVX-512 Vectorized Tree Traversal -oneDAL uses Intel AVX-512 vector instructions (`vpgatherd` and `vcmpp`) to process multiple observations through decision trees simultaneously. 
Instead of traversing one observation at a time, it processes a block of rows through each tree in parallel using SIMD operations for node comparisons and index computations. +### 1. Python/Framework Overhead Elimination -### Cache-Optimized Memory Access -Tree structures are blocked in memory so that a subset of trees and a block of observations fit in the L1 data cache. This ensures the majority of memory accesses are served from L1 cache at maximum bandwidth, rather than incurring costly main memory accesses. +Prediction through each framework's own Python API (XGBoost, LightGBM, CatBoost) incurs significant per-prediction overhead: interpreter dispatch, type checking, array conversion, reference counting, and Python-to-C++ data marshalling. Particularly for small batches, the majority of CPU time in framework-native inference is spent in this framework glue code rather than actual tree traversal. + +By converting the model to oneDAL's C++ representation, daal4py eliminates most of this overhead. Once the single `predict` call enters oneDAL, the prediction hot path runs without any Python interpreter involvement. + +### 2. Vectorized Tree Traversal + +oneDAL uses SIMD instructions (AVX2/AVX-512) to traverse decision trees. Instead of scalar node-by-node comparisons, it processes multiple tree nodes or observations in parallel using vector gather and compare operations. This means the actual tree traversal computation is concentrated in a tight, optimized loop rather than being spread across many small framework functions. + +### 3. Reduced Kernel and Synchronization Overhead + +Native frameworks spend a notable portion of time in kernel space due to Python GIL contention and threading layer interactions (syscalls, thread scheduling, locks). oneDAL minimizes this by keeping execution in user space with efficient thread parallelism. 
## Configuration Recommendations @@ -259,7 +285,7 @@ Tree structures are blocked in memory so that a subset of trees and a block of o | Data Type | Use `float32` for maximum throughput; `float64` is also supported | | Batch Size | oneDAL performs well across batch sizes, with the largest advantage at batch size = 1 (online inference) | | NUMA | For multi-socket systems, pin processes to a single NUMA node to minimize cross-socket memory access | -| daal4py Version | Use daal4py 2023.2 or newer (required for missing values support). Each release includes additional optimizations and bug fixes, so the latest version is recommended | +| scikit-learn-intelex Version | Use the latest version of `scikit-learn-intelex` for best performance, newest model support, and bug fixes | ### Scaling Inference on Multi-Socket Systems @@ -269,7 +295,7 @@ On multi-socket Intel Xeon systems, there are two key decisions that significant A single daal4py process uses internal threading (TBB/OpenMP) to parallelize across available cores. Alternatively, you can run multiple independent OS-level processes, each pinned to a separate NUMA node with its own copy of the model and data. These approaches offer different tradeoffs. -Testing on a 4-NUMA-node Intel Xeon Platinum 8592+ (200K rows, 24 features, 100 trees, `numactl --localalloc`) showed: +Testing on a 4-NUMA-node Intel Xeon Platinum 8592+ (`airline-ohe` dataset, 200K rows, 24 features, 100 trees, `numactl --localalloc`) showed: | Configuration | Throughput (rows/s) | p50 Latency (us) | Scaling | |:--------------|--------------------:|------------------:|:--------| @@ -287,10 +313,12 @@ Key observations: - **Thread scaling is sub-linear** — using 4x the cores in a single process yields only **2.1x** throughput, because cross-socket memory coherency traffic limits scaling. - **The tradeoff is latency**: thread scaling achieves **lower per-request latency** (1,230 us at 128 cores) because all cores collaborate on each prediction. 
Process scaling maintains a fixed latency (~2,000 us per worker, 32 cores each) but delivers **higher aggregate throughput**. -#### Hyper-threading Hurts Performance +#### Hyper-threading Can Hurt Performance daal4py's AVX-512 vectorized tree traversal is [backend-bound](https://www.intel.com/content/www/us/en/docs/vtune-profiler/cookbook/2023-0/top-down-microarchitecture-analysis-method.html) — whether the bottleneck is core execution units or memory bandwidth, adding hyperthreads increases resource contention on the shared physical core, harming performance. +> **Cloud instance note:** On AWS and GCP, each vCPU does not necessarily map to a hyperthread. Smaller instance sizes use soft partitioning, so you may not know how many physical cores vs. hyperthreads you are getting. The guidance below applies most directly to bare-metal or dedicated-host instances where the physical topology is known. On shared instances, benchmark with your specific instance size to determine whether pinning provides a benefit. + | Configuration (1 NUMA node) | Throughput (rows/s) | p50 Latency (us) | |:-----------------------------|--------------------:|------------------:| | 32 physical cores only (`--physcpubind=0-31`) | ~18M | ~2,000 | @@ -319,14 +347,4 @@ numactl --localalloc --physcpubind=96-127 python my_inference.py --shard=3 & **Always pin to physical cores** — use `--physcpubind` with physical core IDs, not `--cpunodebind` which includes hyperthread siblings. On systems where HT cannot be disabled in BIOS, explicit `--physcpubind` ranges are essential. -#### Memory Allocator - -Alternative memory allocators such as jemalloc or tcmalloc can sometimes improve performance over the default glibc malloc. 
It is recommended to test with these enabled to see if either provides a benefit for your workload: -```bash -# jemalloc -LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libjemalloc.so.2 python my_inference.py - -# tcmalloc -LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4 python my_inference.py -``` From 078f47b09b900b4d02fc893528982d44dd88fac7 Mon Sep 17 00:00:00 2001 From: Bibek Bhattarai Date: Mon, 29 Jun 2026 22:38:14 +0000 Subject: [PATCH 4/5] Fixing the software versions --- software/xgboost/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/software/xgboost/README.md b/software/xgboost/README.md index f9cebf5..cfde2a2 100644 --- a/software/xgboost/README.md +++ b/software/xgboost/README.md @@ -210,7 +210,7 @@ The following results were measured on an AWS r8i.12xlarge instance (Intel® Xeo | Santander | 940,160 | 24 | Classification (binary) | 11.07x | 17.22x | 7.42x | | Year Prediction MSD | 515,345 | 90 | Regression | 11.59x | 10.46x | 4.56x | -**Software versions used for benchmarking:** XGBoost 2.1.4, LightGBM 4.6.0, CatBoost 1.2.10, scikit-learn-intelex 2024.7, Python 3.10.12, scikit-learn 1.5.2. For best results, use the latest available versions of these packages. +**Software versions used for benchmarking:** XGBoost 3.2.0, LightGBM 4.6.0, CatBoost 1.2.10, scikit-learn-intelex 2026.0.0, Python 3.10.12, scikit-learn 1.7.2. For best results, use the latest available versions of these packages. 
**Hardware:** AWS r8i.12xlarge (Intel® Xeon® Scalable Processor, Granite Rapids, 48 vCPUs, 384 GB RAM) From 74972138f7ecf162e41dafb923db1bc27891065f Mon Sep 17 00:00:00 2001 From: Bibek Bhattarai Date: Tue, 30 Jun 2026 22:24:33 +0000 Subject: [PATCH 5/5] Fixed the comments from 6/29/2026 --- software/xgboost/README.md | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/software/xgboost/README.md b/software/xgboost/README.md index cfde2a2..2a0153c 100644 --- a/software/xgboost/README.md +++ b/software/xgboost/README.md @@ -2,11 +2,11 @@ ## Introduction -[XGBoost](https://xgboost.readthedocs.io/), [LightGBM](https://lightgbm.readthedocs.io/), and [CatBoost](https://catboost.ai/) are among the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. This guide covers techniques to significantly accelerate inference for these frameworks on Intel® Xeon® processors using [oneDAL (oneAPI Data Analytics Library)](http://uxlfoundation.github.io/oneDAL/) via its Python interface, `daal4py`, provided through the [`scikit-learn-intelex`](https://github.com/intel/scikit-learn-intelex) package. +[XGBoost](https://xgboost.readthedocs.io/), [LightGBM](https://lightgbm.readthedocs.io/), and [CatBoost](https://catboost.ai/) are among the most popular and efficient gradient boosting frameworks for classification and regression tasks on tabular data. This guide covers techniques to significantly accelerate inference for these frameworks on Intel® processors using [oneDAL (oneAPI Data Analytics Library)](http://uxlfoundation.github.io/oneDAL/) via its Python interface, `daal4py`, provided through the [`scikit-learn-intelex`](https://uxlfoundation.github.io/scikit-learn-intelex) package. -By converting trained models to oneDAL, you can achieve **orders of magnitude faster inference** with no loss in prediction quality and minimal code changes. 
oneDAL leverages Intel® Advanced Vector Extensions 512 (AVX-512) and optimized memory access patterns to maximize performance on Intel hardware. +By converting trained models to oneDAL, you can often achieve **an order of magnitude faster inference** (up to ~50x in the benchmarks in this guide) with no loss in prediction quality and minimal code changes. oneDAL leverages SIMD vectorization and optimized memory access patterns to maximize performance on Intel hardware. -> **Note:** `daal4py` supports a specific subset of GBT model configurations (e.g., standard classification and regression trees). For model types not supported by daal4py, consider alternatives such as [ONNX Runtime](https://onnxruntime.ai/) for optimized inference. +> **Note:** `daal4py` supports a specific subset of Gradient Boosted Tree (GBT) model configurations (e.g., standard classification and regression trees). For model types not supported by daal4py, consider alternatives such as [ONNX Runtime](https://onnx.ai/sklearn-onnx/auto_tutorial/plot_gexternal_xgboost.html) or [Treelite/TL2cgen](https://tl2cgen.readthedocs.io/en/latest/) for optimized inference. 
## Contents @@ -32,12 +32,12 @@ By converting trained models to oneDAL, you can achieve **orders of magnitude fa - [scikit-learn-intelex Model Builders Documentation](https://uxlfoundation.github.io/scikit-learn-intelex/latest/model_builders.html) - [About daal4py](https://uxlfoundation.github.io/scikit-learn-intelex/latest/about_daal4py.html) - [oneDAL GitHub Repository](https://github.com/uxlfoundation/oneDAL) -- [scikit-learn-intelex (sklearnex)](https://github.com/intel/scikit-learn-intelex) +- [scikit-learn-intelex (sklearnex)](https://uxlfoundation.github.io/scikit-learn-intelex) ## Prerequisites - Intel® Xeon® Scalable Processor (2nd Generation or newer recommended for AVX-512 support) -- Python version supported by [scikit-learn-intelex](https://github.com/intel/scikit-learn-intelex) (currently 3.10+) +- Python version supported by [scikit-learn-intelex](https://uxlfoundation.github.io/scikit-learn-intelex) (currently 3.10+) - One or more gradient boosting libraries: [XGBoost](https://xgboost.readthedocs.io/) (`xgboost` from PyPI or `py-xgboost` from conda-forge), [LightGBM](https://lightgbm.readthedocs.io/) (`lightgbm`), [CatBoost](https://catboost.ai/) (`catboost`) ## Installation @@ -48,7 +48,7 @@ The `daal4py` module is provided through the `scikit-learn-intelex` package. Ins pip install scikit-learn-intelex ``` -Or from conda-forge: +If using a conda environment ([miniforge](https://github.com/conda-forge/miniforge) distribution is recommended): ```bash conda install -c conda-forge scikit-learn-intelex --override-channels @@ -210,13 +210,15 @@ The following results were measured on an AWS r8i.12xlarge instance (Intel® Xeo | Santander | 940,160 | 24 | Classification (binary) | 11.07x | 17.22x | 7.42x | | Year Prediction MSD | 515,345 | 90 | Regression | 11.59x | 10.46x | 4.56x | -**Software versions used for benchmarking:** XGBoost 3.2.0, LightGBM 4.6.0, CatBoost 1.2.10, scikit-learn-intelex 2026.0.0, Python 3.10.12, scikit-learn 1.7.2. 
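For best results, use the latest available versions of these packages. +**Software versions used for benchmarking:** XGBoost 3.2.0, LightGBM 4.6.0, CatBoost 1.2.10, scikit-learn-intelex 2026.0.0, Python 3.10.12. For best results, use the latest available versions of these packages. **Hardware:** AWS r8i.12xlarge (Intel® Xeon® Scalable Processor, Granite Rapids, 48 vCPUs, 384 GB RAM) Across most datasets, daal4py accelerates inference for all three gradient boosting frameworks. LightGBM sees the largest gains (up to 51x on Airline-OHE), XGBoost achieves 5–16x speedup on the binary classification and regression workloads (about 2.5x on the multiclass datasets), and CatBoost benefits most on high-dimensional binary classification tasks. The exception is multiclass classification (Covtype, PLAsTiCC), where native CatBoost remains faster than daal4py (0.20x and 0.11x). -For multiclass classification, default XGBoost, LightGBM, and daal4py all use one tree per class. CatBoost, on the other hand, uses vectorized trees. This means all other approaches end up processing `num_classes x` more trees compared to CatBoost, e.g., 7,000 vs 1,000 for Covtype. For smaller `num_estimators` like `100`, `daal4py` outperforms CatBoost, but as `num_estimators` gets larger, CatBoost provides better inference latency. +For multiclass classification, XGBoost, LightGBM, and daal4py (with default settings as of the tested versions) use one tree per class, while CatBoost uses symmetric (oblivious) trees that handle all classes in a single tree. This means daal4py ends up processing `num_classes × num_estimators` trees compared to CatBoost's `num_estimators` trees (e.g., 7,000 vs 1,000 for Covtype with 7 classes). As a result, CatBoost can provide better inference latency for multiclass tasks with many classes and large ensembles. + +> **Note:** XGBoost is moving towards multi-output trees (via `multi_strategy="multi_output_tree"`) which would reduce this gap by handling all classes in a single tree, similar to CatBoost. Check the [XGBoost documentation](https://xgboost.readthedocs.io/en/latest/tutorials/multioutput.html) for the latest defaults. ### Reproducing the Benchmark @@ -233,28 +235,32 @@ import daal4py as d4p # Convert the model (works for XGBoost, LightGBM, and CatBoost) d4p_model = d4p.mb.convert_model(model) +# Set batch size (1 = single-row / online inference) +batch_size = 1 +X_batch = X_test[:batch_size] + # Warmup for _ in range(5): - model.predict(X_test) - d4p_model.predict(X_test) + model.predict(X_batch) + d4p_model.predict(X_batch) # Measure native inference -n_iter = 100 +n_iter = 1000 native_times = [] for _ in range(n_iter): t0 = time.perf_counter() - model.predict(X_test) + model.predict(X_batch) native_times.append(time.perf_counter() - t0) # Measure daal4py inference d4p_times = [] for _ in range(n_iter): t0 = time.perf_counter() - d4p_model.predict(X_test) + d4p_model.predict(X_batch) d4p_times.append(time.perf_counter() - t0) speedup = np.mean(native_times) / np.mean(d4p_times) -print(f"Batch size: {batch_size}, Speedup: {speedup:.2f}x") +print(f"Batch size: {batch_size}, Speedup: {speedup:.2f}x") ``` *Performance varies by use, configuration, and other factors.* @@ -283,7 +289,7 @@ Native frameworks spend a notable portion of time in kernel space due to Python 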
### Reproducing the Benchmark @@ -233,28 +235,32 @@ import daal4py as d4p # Convert the model (works for XGBoost, LightGBM, and CatBoost) d4p_model = d4p.mb.convert_model(model) +# Set batch size (1 = single-row / online inference) +batch_size = 1 +X_batch = X_test[:batch_size] + # Warmup for _ in range(5): - model.predict(X_test) - d4p_model.predict(X_test) + model.predict(X_batch) + d4p_model.predict(X_batch) # Measure native inference -n_iter = 100 +n_iter = 1000 native_times = [] for _ in range(n_iter): t0 = time.perf_counter() - model.predict(X_test) + model.predict(X_batch) native_times.append(time.perf_counter() - t0) # Measure daal4py inference d4p_times = [] for _ in range(n_iter): t0 = time.perf_counter() - d4p_model.predict(X_test) + d4p_model.predict(X_batch) d4p_times.append(time.perf_counter() - t0) speedup = np.mean(native_times) / np.mean(d4p_times) -print(f"Speedup: {speedup:.2f}x") +print(f"Batch size: {batch_size}, Speedup: {speedup:.2f}x") ``` *Performance varies by use, configuration, and other factors.* @@ -283,7 +289,7 @@ Native frameworks spend a notable portion of time in kernel space due to Python |:--------|:---------------| | Data Format | Use NumPy contiguous arrays (`np.ascontiguousarray()`) as input for best performance | | Data Type | Use `float32` for maximum throughput; `float64` is also supported | -| Batch Size | oneDAL performs well across batch sizes, with the largest advantage at batch size = 1 (online inference) | +| Batch Size | oneDAL performs well across batch sizes; the speedup advantage is most pronounced at small batch sizes where native framework overhead dominates | | NUMA | For multi-socket systems, pin processes to a single NUMA node to minimize cross-socket memory access | | scikit-learn-intelex Version | Use the latest version of `scikit-learn-intelex` for best performance, newest model support, and bug fixes | @@ -293,7 +299,7 @@ On multi-socket Intel Xeon systems, there are two key decisions that significant 
#### Thread Scaling vs. Process Scaling -A single daal4py process uses internal threading (TBB/OpenMP) to parallelize across available cores. Alternatively, you can run multiple independent OS-level processes, each pinned to a separate NUMA node with its own copy of the model and data. These approaches offer different tradeoffs. +A single daal4py process uses internal threading (TBB) to parallelize across available cores. Alternatively, you can run multiple independent OS-level processes, each pinned to a separate NUMA node with its own copy of the model and data. These approaches offer different tradeoffs. Testing on a 4-NUMA-node Intel Xeon Platinum 8592+ (`airline-ohe` dataset, 200K rows, 24 features, 100 trees, `numactl --localalloc`) showed: