# xRFM - Recursive Feature Machines optimized for tabular data
**xRFM** is a scalable implementation of Recursive Feature Machines (RFMs) optimized for tabular data. This library provides both the core RFM algorithm and a tree-based extension (xRFM) that enables efficient processing of large datasets through recursive data splitting.
## Core Components
```
xRFM/
├── xrfm/
│   ├── xrfm.py                          # Main xRFM class (tree-based)
│   ├── tree_utils.py                    # Tree manipulation utilities
│   └── rfm_src/
│       ├── recursive_feature_machine.py # Base RFM class
│       ├── kernels.py                   # Kernel implementations
│       ├── eigenpro.py                  # EigenPro optimization
│       ├── utils.py                     # Utility functions
│       ├── svd.py                       # SVD operations
│       └── gpu_utils.py                 # GPU memory management
├── examples/                            # Usage examples
└── setup.py                             # Package configuration
```
## Installation
```bash
pip install xrfm
```
Or, to enable the `KermacProductLaplaceKernel` (requires CUDA 11 or CUDA 12):
```bash
pip install xrfm[cu11]
```
or
```bash
pip install xrfm[cu12]
```
### Development Installation
```bash
git clone https://github.com/dmbeaglehole/xRFM.git
cd xRFM
pip install -e .
```
## Quick Start
### Basic Usage
```python
import torch
from xrfm import xRFM
from sklearn.model_selection import train_test_split
# Create synthetic data
def target_function(X):
    return torch.cat([
        (X[:, 0] > 0)[:, None],
        (X[:, 1] < 0.5)[:, None]
    ], dim=1).float()
# Setup device and model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = xRFM(device=device, tuning_metric='mse')
# Generate data
n_samples = 2000
n_features = 100
X = torch.randn(n_samples, n_features, device=device)
y = target_function(X)
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, test_size=0.2, random_state=0)
model.fit(X_train, y_train, X_val, y_val)
y_pred_test = model.predict(X_test)
```
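A quick way to sanity-check the fit is to score predictions against the held-out labels. A minimal sketch, assuming `predict` returns an array-like aligned with `y_test`:

```python
import numpy as np

# Illustrative check (not part of the library API): test-set MSE
y_pred = y_pred_test.cpu().numpy() if hasattr(y_pred_test, 'cpu') else np.asarray(y_pred_test)
y_true = y_test.cpu().numpy()
print(f"test MSE: {float(((y_pred - y_true) ** 2).mean()):.4f}")
```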
### Custom Configuration
```python
# Custom RFM parameters
rfm_params = {
    'model': {
        'kernel': 'l2',            # Kernel type
        'bandwidth': 5.0,          # Kernel bandwidth
        'exponent': 1.0,           # Kernel exponent
        'diag': False,             # Use a diagonal Mahalanobis matrix (False = full)
        'bandwidth_mode': 'constant'
    },
    'fit': {
        'reg': 1e-3,               # Regularization parameter
        'iters': 5,                # Number of iterations
        'M_batch_size': 1000,      # Batch size for AGOP
        'verbose': True,           # Verbose output
        'early_stop_rfm': True     # Early stopping
    }
}
# Initialize model with custom parameters
model = xRFM(
    rfm_params=rfm_params,
    device=device,
    min_subset_size=10000,                    # Minimum subset size for splitting
    tuning_metric='accuracy',                 # Tuning metric
    split_method='top_vector_agop_on_subset'  # Splitting strategy
)
```
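The customized model exposes the same `fit`/`predict` interface; reusing the data from the Basic Usage example:

```python
# Same workflow as in Basic Usage, now with the custom parameters
model.fit(X_train, y_train, X_val, y_val)
y_pred_test = model.predict(X_test)
```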
## Recommended Preprocessing
- **Standardize numerical columns** using a scaler (e.g., `StandardScaler`).
- **One-hot encode categorical columns** and pass their metadata via `categorical_info`.
- **Do not standardize one-hot categorical features.** Use identity matrices for `categorical_vectors`.
### Example (scikit-learn)
```python
import numpy as np
import torch
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
# Assume a pandas DataFrame `df` with:
# - numerical feature columns in `num_cols`
# - categorical feature columns in `cat_cols`
# - target column name in `target_col`
# Split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=0)
# Fit preprocessors on train only
scaler = StandardScaler()
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_num_train = scaler.fit_transform(train_df[num_cols])
X_num_val = scaler.transform(val_df[num_cols])
X_num_test = scaler.transform(test_df[num_cols])
X_cat_train = ohe.fit_transform(train_df[cat_cols])
X_cat_val = ohe.transform(val_df[cat_cols])
X_cat_test = ohe.transform(test_df[cat_cols])
# Concatenate: numerical block first, then categorical block
X_train = np.hstack([X_num_train, X_cat_train]).astype(np.float32)
X_val = np.hstack([X_num_val, X_cat_val]).astype(np.float32)
X_test = np.hstack([X_num_test, X_cat_test]).astype(np.float32)
y_train = train_df[target_col].to_numpy().astype(np.float32)
y_val = val_df[target_col].to_numpy().astype(np.float32)
y_test = test_df[target_col].to_numpy().astype(np.float32)
# Build categorical_info (indices are relative to the concatenated X)
n_num = X_num_train.shape[1]
categorical_indices = []
categorical_vectors = []
start = n_num
for cats in ohe.categories_:
    cat_len = len(cats)
    idxs = torch.arange(start, start + cat_len, dtype=torch.long)
    categorical_indices.append(idxs)
    categorical_vectors.append(torch.eye(cat_len, dtype=torch.float32))  # identity; do not standardize
    start += cat_len
numerical_indices = torch.arange(0, n_num, dtype=torch.long)
categorical_info = dict(
    numerical_indices=numerical_indices,
    categorical_indices=categorical_indices,
    categorical_vectors=categorical_vectors,
)
# Train xRFM with categorical_info
from xrfm import xRFM
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
rfm_params = {
    'model': {
        'kernel': 'l2',
        'bandwidth': 10.0,
        'exponent': 1.0,
        'diag': False,
        'bandwidth_mode': 'constant',
    },
    'fit': {
        'reg': 1e-3,
        'iters': 3,
        'verbose': False,
        'early_stop_rfm': True,
    }
}
model = xRFM(
    rfm_params=rfm_params,
    device=device,
    tuning_metric='mse',
    categorical_info=categorical_info,
)
model.fit(X_train, y_train, X_val, y_val)
y_pred = model.predict(X_test)
```
## File Structure
### Core Files
| File | Description |
|------|-------------|
| `xrfm/xrfm.py` | Main xRFM class implementing tree-based recursive splitting |
| `xrfm/rfm_src/recursive_feature_machine.py` | Base RFM class with core algorithm |
| `xrfm/rfm_src/kernels.py` | Kernel implementations (Laplace, Product Laplace, etc.) |
| `xrfm/rfm_src/eigenpro.py` | EigenPro optimization for large-scale training |
| `xrfm/rfm_src/utils.py` | Utility functions for matrix operations and metrics |
| `xrfm/rfm_src/svd.py` | SVD utilities for kernel computations |
| `xrfm/rfm_src/gpu_utils.py` | GPU memory management utilities |
| `xrfm/tree_utils.py` | Tree manipulation and parameter extraction utilities |
## API Reference
### Main Classes
#### `xRFM`
Tree-based Recursive Feature Machine for scalable learning.
**Key Methods:**
- `fit(X, y, X_val, y_val)`: Train the model
- `predict(X)`: Make predictions
- `predict_proba(X)`: Predict class probabilities
- `score(X, y)`: Evaluate model performance
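For classification, a minimal hedged sketch of this workflow (the label format and shapes accepted by `fit` are assumptions to verify against your task):

```python
import torch
from xrfm import xRFM

# Illustrative binary-classification sketch on synthetic data
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = xRFM(device=device, tuning_metric='accuracy')

X = torch.randn(2000, 20)
y = (X[:, 0] > 0).float()[:, None]     # assumed 0/1 labels as a column vector

model.fit(X[:1200], y[:1200], X[1200:1600], y[1200:1600])
proba = model.predict_proba(X[1600:])  # class probabilities
acc = model.score(X[1600:], y[1600:])  # evaluate model performance
```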
#### `RFM`
Base Recursive Feature Machine implementation.
### Available Kernels
| Kernel | String ID | Description |
|--------|-----------|-------------|
| `LaplaceKernel` | `'laplace'`, `'l2'` | Standard Laplace kernel |
| `KermacProductLaplaceKernel` | `'l1_kermac'` | High-performance Product of Laplace kernels on GPU (requires install with `[cu11]` or `[cu12]`) |
| `KermacLpqLaplaceKernel` | `'lpq_kermac'` | High-performance p-norm, q-exponent Laplace kernels on GPU (requires install with `[cu11]` or `[cu12]`) |
| `LightLaplaceKernel` | `'l2_high_dim'`, `'l2_light'` | Memory-efficient Laplace kernel |
| `ProductLaplaceKernel` | `'product_laplace'`, `'l1'` | Product of Laplace kernels (not recommended; use the Kermac variant if possible) |
| `SumPowerLaplaceKernel` | `'sum_power_laplace'`, `'l1_power'` | Sum of powered Laplace kernels |
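Kernels are selected via the string ID in `rfm_params['model']['kernel']` (see Custom Configuration above). For example, to try the GPU Product Laplace kernel, assuming an `xrfm[cu11]` or `xrfm[cu12]` install:

```python
# Swap in the Kermac kernel by its string ID; requires the CUDA extras
rfm_params['model']['kernel'] = 'l1_kermac'
model = xRFM(rfm_params=rfm_params, device=device, tuning_metric='mse')
```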
### Splitting Methods
| Method | Description |
|--------|-------------|
| `'top_vector_agop_on_subset'` | Use top eigenvector of AGOP matrix |
| `'random_agop_on_subset'` | Use random eigenvector of AGOP matrix |
| `'top_pc_agop_on_subset'` | Use top principal component of AGOP |
| `'random_pca'` | Use a vector sampled from a Gaussian distribution with covariance $X^\top X$ |
| `'linear'` | Use linear regression coefficients |
| `'fixed_vector'` | Use fixed projection vector |
### Tuning Metrics (and creating your own custom metrics)
xRFM scores tuning candidates with the `tuning_metric` string, which applies to both tree splits and leaf RFMs. Built-in options are:
- `mse`, `mae` for regression error
- `accuracy`, `brier`, `logloss`, `f1`, `auc` for classification quality
- `top_agop_vector_auc`, `top_agop_vector_pearson_r`, `top_agop_vectors_ols_auc` for AGOP-aware diagnostics
To register a custom metric:
1. Create a new subclass of `Metric` in `xrfm/rfm_src/metrics.py`, fill in the metadata (`name`, `display_name`, `should_maximize`, `task_types`, `required_quantities`), and implement `_compute(**kwargs)` for the quantities you request.
2. Add the class to the `all_metrics` list inside `Metric.from_name` so the factory can return it by name.
3. Reference the new `name` in the `tuning_metric` argument when constructing `xRFM` or the standalone `RFM`.
Each metric receives tensors on the active device; convert to NumPy as needed. Return higher-is-better values when `should_maximize = True`, otherwise lower-is-better.
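As a sketch of steps 1-3, a hypothetical RMSE metric; the exact base-class hooks (quantity names, `_compute` signature) are assumptions to check against `xrfm/rfm_src/metrics.py`:

```python
# Hypothetical addition to xrfm/rfm_src/metrics.py; quantity names are assumed
import torch

class RMSEMetric(Metric):
    name = 'rmse'
    display_name = 'RMSE'
    should_maximize = False                     # lower is better
    task_types = ('regression',)                # assumed task-type labels
    required_quantities = ('y_true', 'y_pred')  # assumed quantity names

    def _compute(self, y_true=None, y_pred=None, **kwargs):
        # Tensors arrive on the active device; no NumPy conversion needed here
        return torch.sqrt(torch.mean((y_pred - y_true) ** 2)).item()
```

After adding `RMSEMetric` to the `all_metrics` list in `Metric.from_name`, pass `tuning_metric='rmse'` when constructing `xRFM` or the standalone `RFM`.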