# Time Smash
Time series clustering and classification suite using notions of *Universal similarity* among data streams, especially without a priori knowledge about the "correct" features to use for time series data.
+ Featurization algorithms: SymbolicDerivative, InferredHMMLikelihood, Csmash
+ Distance measure: LikelihoodDistance
## Example publications
+ Huang, Yi, Victor Rotaru, and Ishanu Chattopadhyay. "Sequence likelihood divergence for fast time series comparison." Knowledge and Information Systems 65, no. 7 (2023): 3079-3098. https://link.springer.com/article/10.1007/s10115-023-01855-0
+ Chattopadhyay, Ishanu, and Hod Lipson. "Data smashing: uncovering lurking order in data." Journal of The Royal Society Interface 11, no. 101 (2014): 20140826.
https://royalsocietypublishing.org/doi/10.1098/rsif.2014.0826
+ Timesmash: Process-Aware Fast Time Series Clustering and Classification https://easychair.org/publications/preprint/qpVv
For questions or suggestions contact: research@paraknowledge.ai
## Usage examples
### SymbolicDerivative
from timesmash import SymbolicDerivative
from sklearn.ensemble import RandomForestClassifier
train = [[1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0]]
train_label = [[0], [1]]
test = [[0, 1, 1, 0, 1, 1]]
train_features, test_features = SymbolicDerivative().fit_transform(
train=train, test=test, label=train_label
)
clf = RandomForestClassifier().fit(train_features, train_label)
label = clf.predict(test_features)
print("Predicted label: ", label)
### LikelihoodDistance
from timesmash import LikelihoodDistance
from sklearn.cluster import KMeans
train = [[1, 0, 1.1, 0, 11.2, 0], [1, 1, 0, 1, 1, 0], [0, 0.9, 0, 1, 0, 1], [0, 1, 1, 0, 1, 1]]
dist_calc = LikelihoodDistance().fit(train)
dist = dist_calc.produce()
clusters = KMeans(n_clusters=2).fit(dist).labels_
print("Clusters: ", clusters)
### InferredHMMLikelihood
from timesmash import InferredHMMLikelihood
from sklearn.ensemble import RandomForestClassifier
train = [[1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0]]
train_label = [[0], [1]]
test = [[0, 1, 1, 0, 1, 1]]
train_features, test_features = InferredHMMLikelihood().fit_transform(
train=train, test=test, label=train_label
)
clf = RandomForestClassifier().fit(train_features, train_label)
label = clf.predict(test_features)
print("Predicted label: ", label)
### ClusteredHMMClassifier:
from timesmash import Quantizer, InferredHMMLikelihood, LikelihoodDistance
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
train = pd.DataFrame(
[[1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0], [1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0]]
)
train_label = pd.DataFrame([[0], [1], [0], [1]])
test = pd.DataFrame([[0, 1, 1, 0, 1, 1]])
qtz = Quantizer().fit(train, label=train_label)
new_labels = train_label.copy()
for label, dataframe in train_label.groupby(train_label.columns[0]):
dist = LikelihoodDistance(quantizer=qtz).fit(train.loc[dataframe.index]).produce()
sub_labels = KMeans(n_clusters=2, random_state=0).fit(dist).labels_
new_label_names = [str(label) + "_" + str(i) for i in sub_labels]
new_labels.loc[dataframe.index, train_label.columns[0]] = new_label_names
featurizer = InferredHMMLikelihood(quantizer=qtz, epsilon=0.01)
train_features, test_features = featurizer.fit_transform(
train=train, test=test, label=new_labels
)
clf = RandomForestClassifier().fit(train_features, train_label)
print("Predicted label: ", clf.predict(test_features))
### XHMMFeatures for anomaly detection:
import pandas as pd
from timesmash import XHMMFeatures
from sklearn.neighbors import LocalOutlierFactor
channel1_train = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])
channel2_train = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])
labels = pd.DataFrame([1,1], index=['person_1', 'person_2'])
alg = XHMMFeatures(n_quantizations=1)
features_train = alg.fit_transform([channel1_train,channel2_train], labels)
clf = LocalOutlierFactor(novelty=True)
clf.fit(features_train)
channel1_test = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1]], index=['person_test_1', 'person_test_2'])
channel2_test= pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[0,1,0,1,0,1,0,1,0]], index=['person_test_1', 'person_test_2'])
features_test = alg.transform([channel1_test,channel2_test])
print(clf.predict(features_test))
### XHMMFeatures for classification:
import pandas as pd
from timesmash import XHMMFeatures
from sklearn.ensemble import RandomForestClassifier
d1_train = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])
d2_train = pd.DataFrame([[1,0,1,0,1,0,1,0,1,0],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])
labels = pd.DataFrame([0,1], index=['person_1', 'person_2'])
alg = XHMMFeatures(n_quantizations=1)
features_train = alg.fit_transform([d1_train,d2_train], labels)
clf = RandomForestClassifier()
clf.fit(features_train, labels)
d1_test = pd.DataFrame([[1,0,1,0,1,0,1,0,1]], index=['person_test'])
d2_test= pd.DataFrame([[0,1,0,1,0,1,0,1,0]], index=['person_test'])
features_test = alg.transform([d1_test,d2_test])
print(clf.predict(features_test))
### XHMMClustering for multichannel clustering:
import pandas as pd
from timesmash import XHMMClustering
channel1 = pd.DataFrame(
[
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
],
index=["person_1", "person_2", "person_3", "person_4"],
)
channel2 = pd.DataFrame(
[
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
[1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
],
index=["person_1", "person_2", "person_3", "person_4"],
)
alg = XHMMClustering(n_quantizations=1).fit(
[channel1, channel2]
)
clusters = alg.labels_
print(clusters)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/zeroknowledgediscovery/timesmash/HEAD)
Raw data
{
"_id": null,
"home_page": "https://github.com/zeroknowledgediscovery/timesmash",
"name": "timesmash",
"maintainer": "Victor Rotaru",
"docs_url": null,
"requires_python": null,
"maintainer_email": "virotaru@uchicago.edu",
"keywords": "timeseries",
"author": null,
"author_email": null,
"download_url": "https://files.pythonhosted.org/packages/16/ea/6975cb44e75c397b867eee78591732bff6911c6cb77d957d6849df7a4516/timesmash-0.2.26.tar.gz",
"platform": null,
"description": "# Time Smash\n\nTime series clustering and classification suite using notions of *Universal similarity* among data streams, especially without a priori knowledge about the \"correct\" features to use for time series data.\n\n+ Featurization algorithms: SymbolicDerivative, InferredHMMLikelihood, Csmash\n+ Distance measure: LikelihoodDistance\n\n## Example publications\n\n\n+ Huang, Yi, Victor Rotaru, and Ishanu Chattopadhyay. \"Sequence likelihood divergence for fast time series comparison.\" Knowledge and Information Systems 65, no. 7 (2023): 3079-3098. https://link.springer.com/article/10.1007/s10115-023-01855-0\n\n+ Chattopadhyay, Ishanu, and Hod Lipson. \"Data smashing: uncovering lurking order in data.\" Journal of The Royal Society Interface 11, no. 101 (2014): 20140826.\nhttps://royalsocietypublishing.org/doi/10.1098/rsif.2014.0826\n\n+ Timesmash: Process-Aware Fast Time Series Clustering and Classification https://easychair.org/publications/preprint/qpVv\n\n\nFor questions or suggestions contact:research@paraknowledge.ai\n\n##\tUsage examples\t\n### SymbolicDerivative\n\tfrom timesmash import SymbolicDerivative\n\tfrom sklearn.ensemble import RandomForestClassifier\n\n\ttrain = [[1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0]]\n\ttrain_label = [[0], [1]]\n\ttest = [[0, 1, 1, 0, 1, 1]]\n\ttrain_features, test_features = SymbolicDerivative().fit_transform(\n\t train=train, test=test, label=train_label\n\t)\n\tclf = RandomForestClassifier().fit(train_features, train_label)\n\tlabel = clf.predict(test_features)\n\tprint(\"Predicted label: \", label)\n\t\n###\tLikelihoodDistance\t\n\tfrom timesmash import LikelihoodDistance\n\tfrom sklearn.cluster import KMeans\n\ttrain = [[1, 0, 1.1, 0, 11.2, 0], [1, 1, 0, 1, 1, 0], [0, 0.9, 0, 1, 0, 1], [0, 1, 1, 0, 1, 1]]\n\tdist_calc = LikelihoodDistance().fit(train)\n\tdist = dist_calc.produce()\n\tfrom sklearn.cluster import KMeans\n\tclusters = KMeans(n_clusters = 2).fit(dist).labels_\n\tprint(\"Clusters: \" 
clusters)\n\t\n###\tInferredHMMLikelihood\t\n\tfrom timesmash import InferredHMMLikelihood\n\tfrom sklearn.ensemble import RandomForestClassifier\n\n\ttrain = [[1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0]]\n\ttrain_label = [[0], [1]]\n\ttest = [[0, 1, 1, 0, 1, 1]]\n\ttrain_features, test_features = InferredHMMLikelihood().fit_transform(\n\t train=train, test=test, label=train_label\n\t)\n\tclf = RandomForestClassifier().fit(train_features, train_label)\n\tlabel = clf.predict(test_features)\n\tprint(\"Predicted label: \", label)\n\n###\tClusteredHMMClassifier:\t\n\tfrom timesmash import Quantizer, InferredHMMLikelihood, LikelihoodDistance\n\tfrom sklearn.cluster import KMeans\n\tfrom sklearn.ensemble import RandomForestClassifier\n\timport pandas as pd\n\n\ttrain = pd.DataFrame(\n\t [[1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0], [1, 0, 1, 0, 1, 0], [1, 1, 0, 1, 1, 0]]\n\t)\n\ttrain_label = pd.DataFrame([[0], [1], [0], [1]])\n\ttest = pd.DataFrame([[0, 1, 1, 0, 1, 1]])\n\n\tqtz = Quantizer().fit(train, label=train_label)\n\tnew_labels = train_label.copy()\n\tfor label, dataframe in train_label.groupby(train_label.columns[0]):\n\t dist = LikelihoodDistance(quantizer=qtz).fit(train.loc[dataframe.index]).produce()\n\t sub_labels = KMeans(n_clusters=2, random_state=0).fit(dist).labels_\n\t new_label_names = [str(label) + \"_\" + str(i) for i in sub_labels]\n\t new_labels.loc[dataframe.index, train_label.columns[0]] = new_label_names\n\n\tfeaturizer = InferredHMMLikelihood(quantizer=qtz, epsilon=0.01)\n\ttrain_features, test_features = featurizer.fit_transform(\n\t train=train, test=test, label=new_labels\n\t)\n\n\tclf = RandomForestClassifier().fit(train_features, train_label)\n\tprint(\"Predicted label: \", clf.predict(test_features))\n\n###\tXHMMFeatures for anomaly detection:\t\n\timport pandas as pd\n\tfrom timesmash import XHMMFeatures\n\tfrom sklearn.neighbors import LocalOutlierFactor\n\n\tchannel1_train = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1,0]], 
index=['person_1', 'person_2'])\n\tchannel2_train = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])\n\tlabels = pd.DataFrame([1,1], index=['person_1', 'person_2'])\n\t \n\talg = XHMMFeatures(n_quantizations=1)\n\tfeatures_train = alg.fit_transform([channel1_train,channel2_train], labels)\n\t \n\tclf = LocalOutlierFactor(novelty=True) \n\tclf.fit(features_train)\n\t \n\tchannel1_test = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1]], index=['person_test_1', 'person_test_2'])\n\tchannel2_test= pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[0,1,0,1,0,1,0,1,0]], index=['person_test_1', 'person_test_2'])\n\n\tfeatures_test = alg.transform([channel1_test,channel2_test])\n\tprint(clf.predict(features_test))\n\n###\tXHMMFeatures for classification:\t\n\timport pandas as pd\n\tfrom timesmash import XHMMFeatures\n\tfrom sklearn.ensemble import RandomForestClassifier\n\n\td1_train = pd.DataFrame([[0,1,0,1,0,1,0,1,0,1],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])\n\td2_train = pd.DataFrame([[1,0,1,0,1,0,1,0,1,0],[1,0,1,0,1,0,1,0,1,0]], index=['person_1', 'person_2'])\n\tlabels = pd.DataFrame([0,1], index=['person_1', 'person_2'])\n\t \n\talg = XHMMFeatures(n_quantizations=1)\n\tfeatures_train = alg.fit_transform([d1_train,d2_train], labels)\n\t \n\tclf = RandomForestClassifier() \n\tclf.fit(features_train, labels)\n\t \n\td1_test = pd.DataFrame([[1,0,1,0,1,0,1,0,1]], index=['person_test'])\n\td2_test= pd.DataFrame([[0,1,0,1,0,1,0,1,0]], index=['person_test'])\n\n\tfeatures_test = alg.transform([d1_test,d2_test])\n\t \n\tprint(clf.predict(features_test))\n\n###\tXHMMClustering for multichannel clustering:\t\n import pandas as pd\n from timesmash import XHMMClustering\n\n channel1 = pd.DataFrame(\n [\n [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],\n [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],\n [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],\n [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],\n ],\n index=[\"person_1\", \"person_2\", \"person_3\", \"person_4\"],\n )\n channel2 = 
pd.DataFrame(\n [\n [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],\n [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],\n [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],\n [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],\n ],\n index=[\"person_1\", \"person_2\", \"person_3\", \"person_4\"],\n )\n alg = XHMMClustering(n_quantizations=1).fit(\n [channel1, channel2]\n )\n clusters = alg.labels_\n print(clusters)\n\n\t\n[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/zeroknowledgediscovery/timesmash/HEAD)\n",
"bugtrack_url": null,
"license": null,
"summary": "Quantifier of universal similarity amongst arbitrary data streams without a priori knowledge, features, or training.",
"version": "0.2.26",
"project_urls": {
"Homepage": "https://github.com/zeroknowledgediscovery/timesmash"
},
"split_keywords": [
"timeseries"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "72f372e33344cebba8df5570e543aae800cc0f9038f8c6383e791e5f697c5377",
"md5": "699fde4abeca7dc697490e489e082a3c",
"sha256": "8db671cf02517f32b92f240798d072154e6b9d578a0bf10f5d98e551cb093346"
},
"downloads": -1,
"filename": "timesmash-0.2.26-py3-none-any.whl",
"has_sig": false,
"md5_digest": "699fde4abeca7dc697490e489e082a3c",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 33052635,
"upload_time": "2025-01-15T16:02:20",
"upload_time_iso_8601": "2025-01-15T16:02:20.505906Z",
"url": "https://files.pythonhosted.org/packages/72/f3/72e33344cebba8df5570e543aae800cc0f9038f8c6383e791e5f697c5377/timesmash-0.2.26-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "16ea6975cb44e75c397b867eee78591732bff6911c6cb77d957d6849df7a4516",
"md5": "9a6c3d0159ffac88ee6e128cc526578d",
"sha256": "1c0ee0038552589ec7ab6c6dcf7d1e403451ac7cb0b60cca4f229d785028c9ff"
},
"downloads": -1,
"filename": "timesmash-0.2.26.tar.gz",
"has_sig": false,
"md5_digest": "9a6c3d0159ffac88ee6e128cc526578d",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 32852302,
"upload_time": "2025-01-15T16:02:28",
"upload_time_iso_8601": "2025-01-15T16:02:28.973635Z",
"url": "https://files.pythonhosted.org/packages/16/ea/6975cb44e75c397b867eee78591732bff6911c6cb77d957d6849df7a4516/timesmash-0.2.26.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-01-15 16:02:28",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "zeroknowledgediscovery",
"github_project": "timesmash",
"travis_ci": false,
"coveralls": false,
"github_actions": true,
"lcname": "timesmash"
}