# PatX - Pattern eXtraction for Time Series Feature Engineering
[![PyPI version](https://badge.fury.io/py/patx.svg)](https://badge.fury.io/py/patx)
[![Python 3.11+](https://img.shields.io/badge/python-3.11%2B-blue.svg)](https://www.python.org/downloads/)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
PatX is a Python package for extracting B-spline patterns from time series data to create features for machine learning models.
It uses Hyperopt optimization to automatically find patterns that work best for your target variable.
**Key Features:**
- Automatic pattern extraction using B-spline curves with 5 control points by default (configurable via `n_control_points`)
- Support for both univariate and multivariate time series
- Flexible input formats (Pandas DataFrames or NumPy arrays)
- Built-in support for classification and regression tasks
- Hyperopt-based optimization for pattern discovery
- Compatible with any scikit-learn compatible model
## Installation
```bash
pip install patx
```
## Quick Start
### Univariate Time Series (Single Input Series)
For a single time series dataset:
```python
import numpy as np
import pandas as pd
from patx import feature_extraction
from patx.data import load_remc_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
# Load the included REMC dataset
data = load_remc_data(series=("H3K4me3",))
input_series = data['X_list'][0] # Single array
y = data['y']
print(f"Samples: {len(y)}, time points: {input_series.shape[1]}") # (1841, 40)
# Split data
indices = np.arange(len(y))
train_indices, test_indices = train_test_split(
indices, test_size=0.2, random_state=42, stratify=y
)
# Option 1: Pandas DataFrame (recommended)
input_series_train = pd.DataFrame(input_series[train_indices])
input_series_test = pd.DataFrame(input_series[test_indices])
# Option 2: NumPy array (also works)
# input_series_train = input_series[train_indices]
# input_series_test = input_series[test_indices]
y_train, y_test = pd.Series(y[train_indices]), y[test_indices]
# Extract patterns and train model
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
n_trials=100,
show_progress=False
)
# Get results
test_probabilities = result['model'].predict_proba(result['test_features'])[:, 1]  # positive-class column
auc_score = roc_auc_score(y_test, test_probabilities)
print(f"Univariate: {len(result['patterns'])} patterns, AUC={auc_score:.4f}")
print(f"Features shape: {result['train_features'].shape}")
```
### Multivariate Time Series (Multiple Input Series)
For multiple time series datasets:
```python
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from patx import feature_extraction
from patx.data import load_remc_data
# Load multiple input series
data = load_remc_data(series=("H3K4me3", "H3K4me1"))
input_series = data['X_list'] # List of arrays
y = data['y']
series_names = data['series_names']
print(f"Loaded {len(input_series)} input series: {series_names}")
# Split data
indices = np.arange(len(y))
train_indices, test_indices = train_test_split(
indices, test_size=0.2, random_state=42, stratify=y
)
# Option 1: List of Pandas DataFrames (recommended)
input_series_train = [pd.DataFrame(X[train_indices]) for X in input_series]
input_series_test = [pd.DataFrame(X[test_indices]) for X in input_series]
# Option 2: List of NumPy arrays (also works)
# input_series_train = [X[train_indices] for X in input_series]
# input_series_test = [X[test_indices] for X in input_series]
y_train, y_test = y[train_indices], y[test_indices]
# Extract patterns from multiple input series
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
n_trials=100,
show_progress=False
)
test_probabilities = result['model'].predict_proba(result['test_features'])[:, 1]  # positive-class column
auc_score = roc_auc_score(y_test, test_probabilities)
print(f"Multivariate: {len(result['patterns'])} patterns, AUC={auc_score:.4f}")
print(f"Pattern series indices: {[p['series_idx'] for p in result['patterns']]}")
print(f"Features shape: {result['train_features'].shape}")
```
### Using Initial Features
When you have additional features to include alongside pattern features:
```python
# Create some initial features (e.g., statistical features)
def create_statistical_features(X):
return np.column_stack([
np.mean(X, axis=1), # Mean
np.std(X, axis=1), # Standard deviation
np.max(X, axis=1), # Maximum
np.min(X, axis=1), # Minimum
])
# Generate initial features for train and test
# (assumes a single univariate series; for multivariate input, apply per series and stack the results)
initial_features_train = create_statistical_features(input_series_train)
initial_features_test = create_statistical_features(input_series_test)
# Pass initial features to feature_extraction
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
initial_features=(initial_features_train, initial_features_test),
n_trials=100,
show_progress=False
)
print(f"With initial features: {len(result['patterns'])} patterns")
print(f"Total features shape: {result['train_features'].shape}") # Includes initial + pattern features
```
## Input Data Types
PatX supports multiple input data formats:
### Univariate Input (Single Time Series)
```python
import pandas as pd
import numpy as np
from patx import feature_extraction
# Your time series data (samples × time_points)
your_data = np.random.randn(1000, 50) # 1000 samples, 50 time points
your_test_data = np.random.randn(200, 50)
y_train = np.random.randint(0, 2, 1000) # Example target
# Option 1: Pandas DataFrame (recommended)
input_series_train = pd.DataFrame(your_data)
input_series_test = pd.DataFrame(your_test_data)
# Option 2: NumPy array (also works)
# input_series_train = your_data
# input_series_test = your_test_data
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
n_trials=100
)
```
### Multivariate Input (Multiple Time Series)
```python
# Multiple time series data
series1 = np.random.randn(1000, 50) # First time series
series2 = np.random.randn(1000, 50) # Second time series
series3 = np.random.randn(1000, 50) # Third time series
# Option 1: List of Pandas DataFrames (recommended)
input_series_train = [
pd.DataFrame(series1[train_indices]),
pd.DataFrame(series2[train_indices]),
pd.DataFrame(series3[train_indices])
]
input_series_test = [
pd.DataFrame(series1[test_indices]),
pd.DataFrame(series2[test_indices]),
pd.DataFrame(series3[test_indices])
]
# Option 2: List of NumPy arrays (also works)
# input_series_train = [series1[train_indices], series2[train_indices], series3[train_indices]]
# input_series_test = [series1[test_indices], series2[test_indices], series3[test_indices]]
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
n_trials=100
)
# Check which series each pattern came from
print(f"Pattern series indices: {[p['series_idx'] for p in result['patterns']]}")
```
## Pattern Generation
PatX uses B-spline pattern generation with 5 control points by default (configurable via `n_control_points`). The control points are distributed evenly across the time axis, and only their y-values are optimized to find patterns that work best for your target variable.
## Complete Examples
### Example 1: Univariate with NumPy Arrays
```python
import numpy as np
from patx import feature_extraction
# Generate sample data
np.random.seed(42)
X_train = np.random.randn(1000, 30) # 1000 samples, 30 time points
X_test = np.random.randn(200, 30)
y_train = np.random.randint(0, 2, 1000) # Binary classification
y_test = np.random.randint(0, 2, 200)
# Use NumPy arrays directly
result = feature_extraction(
input_series_train=X_train,
y_train=y_train,
input_series_test=X_test,
n_trials=50,
show_progress=False
)
print(f"Found {len(result['patterns'])} patterns")
print(f"Pattern control points: {result['patterns'][0]['control_points']}")
```
### Example 2: Multivariate with Mixed Data Types
```python
import pandas as pd
import numpy as np
from patx import feature_extraction
# Multiple time series with different data types
series1 = np.random.randn(1000, 25) # NumPy array
series2 = np.random.randn(1000, 25) # NumPy array
series3 = np.random.randn(1000, 25) # NumPy array
# Mix of DataFrames and arrays
input_series_train = [
pd.DataFrame(series1), # DataFrame
series2, # NumPy array
pd.DataFrame(series3) # DataFrame
]
input_series_test = [
pd.DataFrame(series1[800:]), # DataFrame
series2[800:], # NumPy array
pd.DataFrame(series3[800:]) # DataFrame
]
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
n_trials=100
)
print(f"Pattern series indices: {[p['series_idx'] for p in result['patterns']]}")
print(f"Pattern widths: {[p['width'] for p in result['patterns']]}")
```
### Example 3: With Custom Initial Features
```python
from patx import feature_extraction
from sklearn.preprocessing import StandardScaler
# Create custom initial features
def create_domain_features(X):
"""Create domain-specific features"""
return np.column_stack([
np.mean(X, axis=1), # Mean
np.std(X, axis=1), # Standard deviation
np.max(X, axis=1), # Maximum
np.min(X, axis=1), # Minimum
np.argmax(X, axis=1), # Index of maximum
np.argmin(X, axis=1), # Index of minimum
np.sum(X > 0, axis=1), # Count of positive values
np.sum(X < 0, axis=1), # Count of negative values
])
# Generate initial features
initial_train = create_domain_features(input_series_train)
initial_test = create_domain_features(input_series_test)
# Normalize initial features
scaler = StandardScaler()
initial_train = scaler.fit_transform(initial_train)
initial_test = scaler.transform(initial_test)
# Extract patterns with initial features
result = feature_extraction(
input_series_train=input_series_train,
y_train=y_train,
input_series_test=input_series_test,
initial_features=(initial_train, initial_test),
n_trials=150,
show_progress=True
)
print(f"Total features: {result['train_features'].shape[1]}")
print(f"Initial features: {initial_train.shape[1]}")
print(f"Pattern features: {result['train_features'].shape[1] - initial_train.shape[1]}")
```
### Example 4: Regression Task
```python
# For regression tasks, PatX automatically detects the metric
y_train_reg = np.random.randn(1000) # Continuous target
y_test_reg = np.random.randn(200)
result_reg = feature_extraction(
input_series_train=input_series_train,
y_train=y_train_reg,
input_series_test=input_series_test,
n_trials=100
)
# Get predictions
predictions = result_reg['model'].predict(result_reg['test_features'])
print(f"Regression RMSE: {np.sqrt(np.mean((y_test_reg - predictions)**2)):.4f}")
```
## API Reference
### pattern_to_features
Convert input data to feature values using pattern parameters.
**Parameters:**
- `input_series`: 3D NumPy array (samples × series × time_points)
- `control_points`: List of control point values for B-spline generation
- `pattern_width`: Width of the pattern region
- `pattern_start`: Starting index of the pattern region
- `series_index`: Index of the input series to use (default: 0)
**Returns:**
- NumPy array of feature values (RMSE between pattern and data, one per sample)
**Example:**
```python
from patx import pattern_to_features
control_points = [0.2, 0.5, 0.8, 0.3, 0.1]
features = pattern_to_features(
input_series=X_train,
control_points=control_points,
pattern_width=20,
pattern_start=5,
series_index=0
)
```
### feature_extraction
The main function for extracting patterns from input series data.
**Parameters:**
- `input_series_train`: Training input series data (DataFrame/array for univariate, list of DataFrames/arrays for multivariate)
- `y_train`: Training targets (Series or array)
- `input_series_test`: Test input series data (same structure as `input_series_train`)
- `initial_features`: Optional initial features (array or tuple of train/test arrays)
- `model`: Optional model instance (defaults to LightGBM based on task)
- `metric`: Optional; defaults to 'auc' if None, supports 'auc', 'accuracy', 'rmse'
- `val_size`: Optional validation split ratio (default: 0.2)
- `n_trials`: Maximum number of optimization trials (default: 300)
- `n_control_points`: Number of B-spline control points (default: 5)
- `show_progress`: Show progress bar (default: True)
**Returns:**
A dictionary containing:
- `patterns`: list of pattern dictionaries, each containing:
- `pattern`: B-spline pattern array
- `start`: start index
- `width`: pattern width
- `series_idx`: input series index (for multivariate)
- `control_points`: B-spline control points
- `train_features`: training feature matrix for the ML model
- `test_features`: test feature matrix for the ML model
- `model`: the trained model
### Data
- `load_remc_data(series)`: Load the included REMC epigenomics dataset (multiple input series)
- `series`: tuple of series names to load (default: `("H3K4me3", "H3K4me1")`)
- Returns dictionary with `X_list`, `y`, `X`, and `series_names`
### Custom Models
You can use any model that has `fit()`, `predict()`, and `predict_proba()` methods, plus a `clone()` method (used internally for refitting), as shown in the wrapper below. Here's an example with sklearn:
**Sklearn Classifier Example:**
```python
from sklearn.linear_model import LogisticRegression
from sklearn.base import clone
class SklearnClassifierWrapper:
def __init__(self, sklearn_model):
self.sklearn_model = sklearn_model
def fit(self, X_train, y_train, X_val=None, y_val=None):
self.sklearn_model.fit(X_train, y_train)
return self
def predict(self, X):
return self.sklearn_model.predict(X)
def predict_proba(self, X):
return self.sklearn_model.predict_proba(X)
def clone(self):
return SklearnClassifierWrapper(clone(self.sklearn_model))
# Use custom model
model = SklearnClassifierWrapper(LogisticRegression())
result = feature_extraction(input_series_train, y_train, input_series_test, model=model)
```
This wrapper works with any sklearn classifier (RandomForest, SVM, etc.).
## Citation
If you use PatX in your research, please cite:
```bibtex
@software{patx,
title={PatX: Pattern eXtraction for Time Series Feature Engineering},
author={Wolber, J.},
year={2025},
url={https://github.com/Prgrmmrjns/patX}
}
```
Raw data
{
"_id": null,
"home_page": null,
"name": "patx",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.11",
"maintainer_email": "Jonas Wolber <jonascw@web.de>",
"keywords": "time-series, spatial-data, feature-engineering, pattern-extraction, machine-learning, optimization, polynomial-patterns",
"author": null,
"author_email": "Jonas Wolber <jonascw@web.de>",
"download_url": "https://files.pythonhosted.org/packages/b8/a9/e4d434f1a6fa962deb117f3526b5e8baa640853f6d10e8bfb1bc1c90bb82/patx-0.3.4.tar.gz",
"platform": null,
"description": "# PatX - Pattern eXtraction for Time Series Feature Engineering\n\n[](https://badge.fury.io/py/patx)\n[](https://www.python.org/downloads/)\n[](https://opensource.org/licenses/MIT)\n\nPatX is a Python package for extracting B-spline patterns from time series data to create features for machine learning models. \nIt uses Hyperopt optimization to automatically find patterns that work best for your target variable.\n\n**Key Features:**\n- Automatic pattern extraction using B-spline curves with 5 control points\n- Support for both univariate and multivariate time series\n- Flexible input formats (Pandas DataFrames or NumPy arrays)\n- Built-in support for classification and regression tasks\n- Hyperopt-based optimization for pattern discovery\n- Compatible with any scikit-learn compatible model\n\n## Installation\n\n```bash\npip install patx\n```\n\n## Quick Start\n\n### Univariate Time Series (Single Input Series)\n\nFor a single time series dataset:\n\n```python\nimport numpy as np\nimport pandas as pd\nfrom patx import feature_extraction\nfrom patx.data import load_remc_data\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.metrics import roc_auc_score\n\n# Load the included REMC dataset\ndata = load_remc_data(series=(\"H3K4me3\",))\ninput_series = data['X_list'][0] # Single array\ny = data['y']\n\nprint(f\"Samples: {len(y)}, time points: {input_series.shape[1]}\") # (1841, 40)\n\n# Split data\nindices = np.arange(len(y))\ntrain_indices, test_indices = train_test_split(\n indices, test_size=0.2, random_state=42, stratify=y\n)\n\n# Option 1: Pandas DataFrame (recommended)\ninput_series_train = pd.DataFrame(input_series[train_indices])\ninput_series_test = pd.DataFrame(input_series[test_indices])\n\n# Option 2: NumPy array (also works)\n# input_series_train = input_series[train_indices]\n# input_series_test = input_series[test_indices]\n\ny_train, y_test = pd.Series(y[train_indices]), y[test_indices]\n\n# Extract patterns and train 
model\nresult = feature_extraction(\n input_series_train=input_series_train, \n y_train=y_train, \n input_series_test=input_series_test, \n n_trials=100, \n show_progress=False\n)\n\n# Get results\ntest_probabilities = result['model'].predict_proba(result['test_features'])\nauc_score = roc_auc_score(y_test, test_probabilities)\n\nprint(f\"Univariate: {len(result['patterns'])} patterns, AUC={auc_score:.4f}\")\nprint(f\"Features shape: {result['train_features'].shape}\")\n```\n\n### Multivariate Time Series (Multiple Input Series)\n\nFor multiple time series datasets:\n\n```python\nfrom patx import feature_extraction\nfrom patx.data import load_remc_data\n\n# Load multiple input series\ndata = load_remc_data(series=(\"H3K4me3\", \"H3K4me1\"))\ninput_series = data['X_list'] # List of arrays\ny = data['y']\nseries_names = data['series_names']\n\nprint(f\"Loaded {len(input_series)} input series: {series_names}\")\n\n# Split data\nindices = np.arange(len(y))\ntrain_indices, test_indices = train_test_split(\n indices, test_size=0.2, random_state=42, stratify=y\n)\n\n# Option 1: List of Pandas DataFrames (recommended)\ninput_series_train = [pd.DataFrame(X[train_indices]) for X in input_series]\ninput_series_test = [pd.DataFrame(X[test_indices]) for X in input_series]\n\n# Option 2: List of NumPy arrays (also works)\n# input_series_train = [X[train_indices] for X in input_series]\n# input_series_test = [X[test_indices] for X in input_series]\n\ny_train, y_test = y[train_indices], y[test_indices]\n\n# Extract patterns from multiple input series\nresult = feature_extraction(\n input_series_train=input_series_train, \n y_train=y_train, \n input_series_test=input_series_test, \n n_trials=100, \n show_progress=False\n)\n\ntest_probabilities = result['model'].predict_proba(result['test_features'])\nauc_score = roc_auc_score(y_test, test_probabilities)\n\nprint(f\"Multivariate: {len(result['patterns'])} patterns, AUC={auc_score:.4f}\")\nprint(f\"Pattern series indices: 
{[p['series_idx'] for p in result['patterns']]}\")\nprint(f\"Features shape: {result['train_features'].shape}\")\n```\n\n### Using Initial Features\n\nWhen you have additional features to include alongside pattern features:\n\n```python\n# Create some initial features (e.g., statistical features)\ndef create_statistical_features(X):\n return np.column_stack([\n np.mean(X, axis=1), # Mean\n np.std(X, axis=1), # Standard deviation\n np.max(X, axis=1), # Maximum\n np.min(X, axis=1), # Minimum\n ])\n\n# Generate initial features for train and test\ninitial_features_train = create_statistical_features(input_series_train)\ninitial_features_test = create_statistical_features(input_series_test)\n\n# Pass initial features to feature_extraction\nresult = feature_extraction(\n input_series_train=input_series_train, \n y_train=y_train, \n input_series_test=input_series_test,\n initial_features=(initial_features_train, initial_features_test),\n n_trials=100, \n show_progress=False\n)\n\nprint(f\"With initial features: {len(result['patterns'])} patterns\")\nprint(f\"Total features shape: {result['train_features'].shape}\") # Includes initial + pattern features\n```\n\n\n## Input Data Types\n\nPatX supports multiple input data formats:\n\n### Univariate Input (Single Time Series)\n\n```python\nimport pandas as pd\nimport numpy as np\nfrom patx import feature_extraction\n\n# Your time series data (samples \u00d7 time_points)\nyour_data = np.random.randn(1000, 50) # 1000 samples, 50 time points\nyour_test_data = np.random.randn(200, 50)\ny_train = np.random.randint(0, 2, 1000) # Example target\n\n# Option 1: Pandas DataFrame (recommended)\ninput_series_train = pd.DataFrame(your_data)\ninput_series_test = pd.DataFrame(your_test_data)\n\n# Option 2: NumPy array (also works)\n# input_series_train = your_data\n# input_series_test = your_test_data\n\nresult = feature_extraction(\n input_series_train=input_series_train, \n y_train=y_train, \n input_series_test=input_series_test, \n 
n_trials=100\n)\n```\n\n### Multivariate Input (Multiple Time Series)\n\n```python\n# Multiple time series data\nseries1 = np.random.randn(1000, 50) # First time series\nseries2 = np.random.randn(1000, 50) # Second time series\nseries3 = np.random.randn(1000, 50) # Third time series\n\n# Option 1: List of Pandas DataFrames (recommended)\ninput_series_train = [\n pd.DataFrame(series1[train_indices]),\n pd.DataFrame(series2[train_indices]),\n pd.DataFrame(series3[train_indices])\n]\ninput_series_test = [\n pd.DataFrame(series1[test_indices]),\n pd.DataFrame(series2[test_indices]),\n pd.DataFrame(series3[test_indices])\n]\n\n# Option 2: List of NumPy arrays (also works)\n# input_series_train = [series1[train_indices], series2[train_indices], series3[train_indices]]\n# input_series_test = [series1[test_indices], series2[test_indices], series3[test_indices]]\n\nresult = feature_extraction(\n input_series_train=input_series_train, \n y_train=y_train, \n input_series_test=input_series_test, \n n_trials=100\n)\n\n# Check which series each pattern came from\nprint(f\"Pattern series indices: {[p['series_idx'] for p in result['patterns']]}\")\n```\n\n## Pattern Generation\n\nPatX uses B-spline pattern generation with 5 control points. 
The control points are distributed evenly across the time axis, and only their y-values are optimized to find patterns that work best for your target variable.\n\n## Complete Examples\n\n### Example 1: Univariate with NumPy Arrays\n\n```python\nimport numpy as np\nfrom patx import feature_extraction\n\n# Generate sample data\nnp.random.seed(42)\nX_train = np.random.randn(1000, 30) # 1000 samples, 30 time points\nX_test = np.random.randn(200, 30)\ny_train = np.random.randint(0, 2, 1000) # Binary classification\ny_test = np.random.randint(0, 2, 200)\n\n# Use NumPy arrays directly\nresult = feature_extraction(\n input_series_train=X_train,\n y_train=y_train,\n input_series_test=X_test,\n n_trials=50,\n show_progress=False\n)\n\nprint(f\"Found {len(result['patterns'])} patterns\")\nprint(f\"Pattern control points: {result['patterns'][0]['control_points']}\")\n```\n\n### Example 2: Multivariate with Mixed Data Types\n\n```python\nimport pandas as pd\nimport numpy as np\nfrom patx import feature_extraction\n\n# Multiple time series with different data types\nseries1 = np.random.randn(1000, 25) # NumPy array\nseries2 = np.random.randn(1000, 25) # NumPy array\nseries3 = np.random.randn(1000, 25) # NumPy array\n\n# Mix of DataFrames and arrays\ninput_series_train = [\n pd.DataFrame(series1), # DataFrame\n series2, # NumPy array\n pd.DataFrame(series3) # DataFrame\n]\n\ninput_series_test = [\n pd.DataFrame(series1[800:]), # DataFrame\n series2[800:], # NumPy array\n pd.DataFrame(series3[800:]) # DataFrame\n]\n\nresult = feature_extraction(\n input_series_train=input_series_train,\n y_train=y_train,\n input_series_test=input_series_test,\n n_trials=100\n)\n\nprint(f\"Pattern series indices: {[p['series_idx'] for p in result['patterns']]}\")\nprint(f\"Pattern widths: {[p['width'] for p in result['patterns']]}\")\n```\n\n### Example 3: With Custom Initial Features\n\n```python\nfrom patx import feature_extraction\nfrom sklearn.preprocessing import StandardScaler\n\n# Create 
custom initial features\ndef create_domain_features(X):\n \"\"\"Create domain-specific features\"\"\"\n return np.column_stack([\n np.mean(X, axis=1), # Mean\n np.std(X, axis=1), # Standard deviation\n np.max(X, axis=1), # Maximum\n np.min(X, axis=1), # Minimum\n np.argmax(X, axis=1), # Index of maximum\n np.argmin(X, axis=1), # Index of minimum\n np.sum(X > 0, axis=1), # Count of positive values\n np.sum(X < 0, axis=1), # Count of negative values\n ])\n\n# Generate initial features\ninitial_train = create_domain_features(input_series_train)\ninitial_test = create_domain_features(input_series_test)\n\n# Normalize initial features\nscaler = StandardScaler()\ninitial_train = scaler.fit_transform(initial_train)\ninitial_test = scaler.transform(initial_test)\n\n# Extract patterns with initial features\nresult = feature_extraction(\n input_series_train=input_series_train,\n y_train=y_train,\n input_series_test=input_series_test,\n initial_features=(initial_train, initial_test),\n n_trials=150,\n show_progress=True\n)\n\nprint(f\"Total features: {result['train_features'].shape[1]}\")\nprint(f\"Initial features: {initial_train.shape[1]}\")\nprint(f\"Pattern features: {result['train_features'].shape[1] - initial_train.shape[1]}\")\n```\n\n### Example 4: Regression Task\n\n```python\n# For regression tasks, PatX automatically detects the metric\ny_train_reg = np.random.randn(1000) # Continuous target\ny_test_reg = np.random.randn(200)\n\nresult_reg = feature_extraction(\n input_series_train=input_series_train,\n y_train=y_train_reg,\n input_series_test=input_series_test,\n n_trials=100\n)\n\n# Get predictions\npredictions = result_reg['model'].predict(result_reg['test_features'])\nprint(f\"Regression RMSE: {np.sqrt(np.mean((y_test_reg - predictions)**2)):.4f}\")\n```\n\n## API Reference\n\n### pattern_to_features\n\nConvert input data to feature values using pattern parameters.\n\n**Parameters:**\n- `input_series`: 3D NumPy array (samples \u00d7 series \u00d7 
time_points)\n- `control_points`: List of control point values for B-spline generation\n- `pattern_width`: Width of the pattern region\n- `pattern_start`: Starting index of the pattern region\n- `series_index`: Index of the input series to use (default: 0)\n\n**Returns:**\n- NumPy array of feature values (RMSE between pattern and data, one per sample)\n\n**Example:**\n```python\nfrom patx import pattern_to_features\n\ncontrol_points = [0.2, 0.5, 0.8, 0.3, 0.1]\nfeatures = pattern_to_features(\n input_series=X_train,\n control_points=control_points,\n pattern_width=20,\n pattern_start=5,\n series_index=0\n)\n```\n\n### feature_extraction\n\nThe main function for extracting patterns from input series data.\n\n**Parameters:**\n- `input_series_train`: Training input series data (DataFrame/array for univariate, list of DataFrames/arrays for multivariate)\n- `y_train`: Training targets (Series or array)\n- `input_series_test`: Test input series data (same structure as `input_series_train`)\n- `initial_features`: Optional initial features (array or tuple of train/test arrays)\n- `model`: Optional model instance (defaults to LightGBM based on task)\n- `metric`: Optional; defaults to 'auc' if None, supports 'auc', 'accuracy', 'rmse'\n- `val_size`: Optional validation split ratio (default: 0.2)\n- `n_trials`: Maximum number of optimization trials (default: 300)\n- `n_control_points`: Number of B-spline control points (default: 5)\n- `show_progress`: Show progress bar (default: True)\n\n**Returns:**\nA dictionary containing:\n- `patterns`: list of pattern dictionaries, each containing:\n - `pattern`: B-spline pattern array\n - `start`: start index\n - `width`: pattern width\n - `series_idx`: input series index (for multivariate)\n - `control_points`: B-spline control points\n- `train_features`: training feature matrix for the ML model\n- `test_features`: test feature matrix for the ML model\n- `model`: the trained model\n\n### Data\n\n- `load_remc_data(series)`: Load the 
included REMC epigenomics dataset (multiple input series)\n - `series`: tuple of series names to load (default: `(\"H3K4me3\", \"H3K4me1\")`)\n - Returns dictionary with `X_list`, `y`, `X`, and `series_names`\n\n### Custom Models\n\nYou can use any model that has `fit()`, `predict()`, and `predict_proba()` methods. Here's an example with sklearn:\n\n**Sklearn Classifier Example:**\n```python\nfrom sklearn.linear_model import LogisticRegression\nfrom sklearn.base import clone\n\nclass SklearnClassifierWrapper:\n def __init__(self, sklearn_model):\n self.sklearn_model = sklearn_model\n \n def fit(self, X_train, y_train, X_val=None, y_val=None):\n self.sklearn_model.fit(X_train, y_train)\n return self\n \n def predict(self, X):\n return self.sklearn_model.predict(X)\n \n def predict_proba(self, X):\n return self.sklearn_model.predict_proba(X)\n \n def clone(self):\n return SklearnClassifierWrapper(clone(self.sklearn_model))\n\n# Use custom model\nmodel = SklearnClassifierWrapper(LogisticRegression())\nresult = feature_extraction(input_series_train, y_train, input_series_test, model=model)\n```\n\nThis wrapper works with any sklearn classifier (RandomForest, SVM, etc.).\n\n## Citation\n\nIf you use PatX in your research, please cite:\n\n```bibtex\n@software{patx,\n title={PatX: Pattern eXtraction for Time Series Feature Engineering},\n author={Wolber, J.},\n year={2025},\n url={https://github.com/Prgrmmrjns/patX}\n}\n```\n",
"bugtrack_url": null,
"license": null,
"summary": "Pattern eXtraction for Time Series and Spatial Data",
"version": "0.3.4",
"project_urls": {
"Repository": "https://github.com/Prgrmmrjns/patX"
},
"split_keywords": [
"time-series",
" spatial-data",
" feature-engineering",
" pattern-extraction",
" machine-learning",
" optimization",
" polynomial-patterns"
],
"urls": [
{
"comment_text": null,
"digests": {
"blake2b_256": "74deda4645425edec1905e0f47beb3740b238544142a84333a5de1207f1fa5e9",
"md5": "77cd278621404eda01f38b326ce5ec2d",
"sha256": "f994c835ef87e8fe26b960589573e87f98b6932563e9f6184f4c8f4bb12ff2c5"
},
"downloads": -1,
"filename": "patx-0.3.4-py3-none-any.whl",
"has_sig": false,
"md5_digest": "77cd278621404eda01f38b326ce5ec2d",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.11",
"size": 286745,
"upload_time": "2025-10-11T04:48:55",
"upload_time_iso_8601": "2025-10-11T04:48:55.425333Z",
"url": "https://files.pythonhosted.org/packages/74/de/da4645425edec1905e0f47beb3740b238544142a84333a5de1207f1fa5e9/patx-0.3.4-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": null,
"digests": {
"blake2b_256": "b8a9e4d434f1a6fa962deb117f3526b5e8baa640853f6d10e8bfb1bc1c90bb82",
"md5": "e686c452e3ea51f9a2312fda5b3675fc",
"sha256": "1ac35d91356cfd88967e6ddf5e67c1213cdf6828f33b0a78e47aa251c6f475b0"
},
"downloads": -1,
"filename": "patx-0.3.4.tar.gz",
"has_sig": false,
"md5_digest": "e686c452e3ea51f9a2312fda5b3675fc",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.11",
"size": 291900,
"upload_time": "2025-10-11T04:48:56",
"upload_time_iso_8601": "2025-10-11T04:48:56.817920Z",
"url": "https://files.pythonhosted.org/packages/b8/a9/e4d434f1a6fa962deb117f3526b5e8baa640853f6d10e8bfb1bc1c90bb82/patx-0.3.4.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-10-11 04:48:56",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "Prgrmmrjns",
"github_project": "patX",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "numpy",
"specs": []
},
{
"name": "pyarrow",
"specs": []
},
{
"name": "scikit-learn",
"specs": []
},
{
"name": "hyperopt",
"specs": []
},
{
"name": "lightgbm",
"specs": []
}
],
"lcname": "patx"
}