# Collatable
[![Actions Status](https://github.com/altescy/collatable/workflows/CI/badge.svg)](https://github.com/altescy/collatable/actions/workflows/ci.yml)
[![License](https://img.shields.io/github/license/altescy/collatable)](https://github.com/altescy/collatable/blob/main/LICENSE)
[![Python version](https://img.shields.io/pypi/pyversions/collatable)](https://github.com/altescy/collatable)
[![pypi version](https://img.shields.io/pypi/v/collatable)](https://pypi.org/project/collatable/)
Constructing batched tensors for any machine learning task
## Installation
```bash
pip install collatable
```
## Examples
The following scripts show how to tokenize/index/collate your dataset with `collatable`:
### Text Classification
```python
import collatable
from collatable import Instance, LabelField, MetadataField, TextField
from collatable.extras.indexer import LabelIndexer, TokenIndexer

dataset = [
    ("this is awesome", "positive"),
    ("this is a bad movie", "negative"),
    ("this movie is an awesome movie", "positive"),
    ("this movie is too bad to watch", "negative"),
]

# Set up indexers for tokens and labels
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
token_indexer = TokenIndexer[str](specials=[PAD_TOKEN, UNK_TOKEN], default=UNK_TOKEN)
label_indexer = LabelIndexer[str]()

# Load training dataset
instances = []
with token_indexer.context(train=True), label_indexer.context(train=True):
    for id_, (text, label) in enumerate(dataset):
        # Prepare each field with the corresponding field class
        text_field = TextField(
            text.split(),  # whitespace tokenization
            indexer=token_indexer,
            padding_value=token_indexer[PAD_TOKEN],
        )
        label_field = LabelField(
            label,
            indexer=label_indexer,
        )
        metadata_field = MetadataField({"id": id_})
        # Combine these fields into an instance
        instance = Instance(
            text=text_field,
            label=label_field,
            metadata=metadata_field,
        )
        instances.append(instance)

# Collate instances and build a batch
output = collatable.collate(instances)
print(output)
```
Execution result:
```text
{'metadata': [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}],
'text': {
'token_ids': array([[ 2, 3, 4, 0, 0, 0, 0],
[ 2, 3, 5, 6, 7, 0, 0],
[ 2, 7, 3, 8, 4, 7, 0],
[ 2, 7, 3, 9, 6, 10, 11]]),
'mask': array([[ True, True, True, False, False, False, False],
[ True, True, True, True, True, False, False],
[ True, True, True, True, True, True, False],
[ True, True, True, True, True, True, True]])},
'label': array([0, 1, 0, 1], dtype=int32)}
```
### Sequence Labeling
```python
import collatable
from collatable import Instance, SequenceLabelField, TextField
from collatable.extras.indexer import LabelIndexer, TokenIndexer

dataset = [
    (["my", "name", "is", "john", "smith"], ["O", "O", "O", "B", "I"]),
    (["i", "lived", "in", "japan", "three", "years", "ago"], ["O", "O", "O", "U", "O", "O", "O"]),
]

# Set up indexers for tokens and labels
PAD_TOKEN = "<PAD>"
token_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))
label_indexer = LabelIndexer[str]()

# Load training dataset
instances = []
with token_indexer.context(train=True), label_indexer.context(train=True):
    for tokens, labels in dataset:
        text_field = TextField(tokens, indexer=token_indexer, padding_value=token_indexer[PAD_TOKEN])
        # The label field receives the text field it annotates (one label per token here)
        label_field = SequenceLabelField(labels, text_field, indexer=label_indexer)
        instance = Instance(text=text_field, label=label_field)
        instances.append(instance)

# Collate instances and build a batch
output = collatable.collate(instances)
print(output)
```
Execution result:
```text
{'label': array([[0, 0, 0, 1, 2, 0, 0],
[0, 0, 0, 3, 0, 0, 0]]),
'text': {
'token_ids': array([[ 1, 2, 3, 4, 5, 0, 0],
[ 6, 7, 8, 9, 10, 11, 12]]),
'mask': array([[ True, True, True, True, True, False, False],
[ True, True, True, True, True, True, True]])}}
```
### Relation Extraction
```python
import collatable
from collatable import AdjacencyField, Instance, ListField, SpanField, TextField
from collatable.extras.indexer import LabelIndexer, TokenIndexer

# Set up indexers for tokens and relation labels
PAD_TOKEN = "<PAD>"
token_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))
label_indexer = LabelIndexer[str]()

instances = []
with token_indexer.context(train=True), label_indexer.context(train=True):
    # First instance: three entity spans and two relations between them
    text = TextField(
        ["john", "smith", "was", "born", "in", "new", "york", "and", "now", "lives", "in", "tokyo"],
        indexer=token_indexer,
        padding_value=token_indexer[PAD_TOKEN],
    )
    # NOTE(review): span ends look exclusive -- (11, 12) on a 12-token text
    # can only cover "tokyo"; confirm against the SpanField documentation.
    spans = ListField([SpanField(0, 2, text), SpanField(5, 7, text), SpanField(11, 12, text)])
    # Each (i, j) pair presumably indexes into `spans`, with one label per pair
    relations = AdjacencyField([(0, 1), (0, 2)], spans, labels=["born-in", "lives-in"], indexer=label_indexer)
    instance = Instance(text=text, spans=spans, relations=relations)
    instances.append(instance)

    # Second instance: shorter text with two spans and a single relation
    text = TextField(
        ["tokyo", "is", "the", "capital", "of", "japan"],
        indexer=token_indexer,
        padding_value=token_indexer[PAD_TOKEN],
    )
    spans = ListField([SpanField(0, 1, text), SpanField(5, 6, text)])
    relations = AdjacencyField([(0, 1)], spans, labels=["capital-of"], indexer=label_indexer)
    instance = Instance(text=text, spans=spans, relations=relations)
    instances.append(instance)

# Collate instances and build a batch
output = collatable.collate(instances)
print(output)
```
Execution result:
```text
{'text': {
'token_ids': array([[ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 5, 11],
[11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0]]),
'mask': array([[ True, True, True, True, True, True, True, True, True, True, True, True],
[ True, True, True, True, True, True, False, False, False, False, False, False]])},
'spans': array([[[ 0, 2],
[ 5, 7],
[11, 12]],
[[ 0, 1],
[ 5, 6],
[-1, -1]]]),
'relations': array([[[-1, 0, 1],
[-1, -1, -1],
[-1, -1, -1]],
[[-1, 2, -1],
[-1, -1, -1],
[-1, -1, -1]]], dtype=int32)}
```
Raw data
{
"_id": null,
"home_page": "https://github.com/altescy/collatable",
"name": "collatable",
"maintainer": "",
"docs_url": null,
"requires_python": ">=3.8.1,<4.0",
"maintainer_email": "",
"keywords": "python,machine learning",
"author": "altescy",
"author_email": "altescy@fastmail.com",
"download_url": "https://files.pythonhosted.org/packages/83/ac/de2548d9a7f47e143e96e010922e4f34b28fa268441ab448c2eae40d5747/collatable-0.3.1.tar.gz",
"platform": null,
"description": "# Collatable\n\n[![Actions Status](https://github.com/altescy/collatable/workflows/CI/badge.svg)](https://github.com/altescy/collatable/actions/workflows/ci.yml)\n[![License](https://img.shields.io/github/license/altescy/collatable)](https://github.com/altescy/collatable/blob/main/LICENSE)\n[![Python version](https://img.shields.io/pypi/pyversions/collatable)](https://github.com/altescy/collatable)\n[![pypi version](https://img.shields.io/pypi/v/collatable)](https://pypi.org/project/collatable/)\n\nConstructing batched tensors for any machine learning tasks\n\n## Installation\n\n```bash\npip install collatable\n```\n\n## Examples\n\nThe following scripts show how to tokenize/index/collate your dataset with `collatable`:\n\n### Text Classification\n\n```python\nimport collatable\nfrom collatable import Instance, LabelField, MetadataField, TextField\nfrom collatable.extras.indexer import LabelIndexer, TokenIndexer\n\ndataset = [\n (\"this is awesome\", \"positive\"),\n (\"this is a bad movie\", \"negative\"),\n (\"this movie is an awesome movie\", \"positive\"),\n (\"this movie is too bad to watch\", \"negative\"),\n]\n\n# Set up indexers for tokens and labels\nPAD_TOKEN = \"<PAD>\"\nUNK_TOKEN = \"<UNK>\"\ntoken_indexer = TokenIndexer[str](specials=[PAD_TOKEN, UNK_TOKEN], default=UNK_TOKEN)\nlabel_indexer = LabelIndexer[str]()\n\n# Load training dataset\ninstances = []\nwith token_indexer.context(train=True), label_indexer.context(train=True):\n for id_, (text, label) in enumerate(dataset):\n # Prepare each field with the corresponding field class\n text_field = TextField(\n text.split(),\n indexer=token_indexer,\n padding_value=token_indexer[PAD_TOKEN],\n )\n label_field = LabelField(\n label,\n indexer=label_indexer,\n )\n metadata_field = MetadataField({\"id\": id_})\n # Combine these fields into instance\n instance = Instance(\n text=text_field,\n label=label_field,\n metadata=metadata_field,\n )\n instances.append(instance)\n\n# Collate instances 
and build batch\noutput = collatable.collate(instances)\nprint(output)\n```\n\nExecution result:\n\n```text\n{'metadata': [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}],\n 'text': {\n 'token_ids': array([[ 2, 3, 4, 0, 0, 0, 0],\n [ 2, 3, 5, 6, 7, 0, 0],\n [ 2, 7, 3, 8, 4, 7, 0],\n [ 2, 7, 3, 9, 6, 10, 11]]),\n 'mask': array([[ True, True, True, False, False, False, False],\n [ True, True, True, True, True, False, False],\n [ True, True, True, True, True, True, False],\n [ True, True, True, True, True, True, True]])},\n 'label': array([0, 1, 0, 1], dtype=int32)}\n```\n\n### Sequence Labeling\n\n```python\nimport collatable\nfrom collatable import Instance, SequenceLabelField, TextField\nfrom collatable.extras.indexer import LabelIndexer, TokenIndexer\n\ndataset = [\n ([\"my\", \"name\", \"is\", \"john\", \"smith\"], [\"O\", \"O\", \"O\", \"B\", \"I\"]),\n ([\"i\", \"lived\", \"in\", \"japan\", \"three\", \"years\", \"ago\"], [\"O\", \"O\", \"O\", \"U\", \"O\", \"O\", \"O\"]),\n]\n\n# Set up indexers for tokens and labels\nPAD_TOKEN = \"<PAD>\"\ntoken_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))\nlabel_indexer = LabelIndexer[str]()\n\n# Load training dataset\ninstances = []\nwith token_indexer.context(train=True), label_indexer.context(train=True):\n for tokens, labels in dataset:\n text_field = TextField(tokens, indexer=token_indexer, padding_value=token_indexer[PAD_TOKEN])\n label_field = SequenceLabelField(labels, text_field, indexer=label_indexer)\n instance = Instance(text=text_field, label=label_field)\n instances.append(instance)\n\noutput = collatable.collate(instances)\nprint(output)\n```\n\nExecution result:\n\n```text\n{'label': array([[0, 0, 0, 1, 2, 0, 0],\n [0, 0, 0, 3, 0, 0, 0]]),\n 'text': {\n 'token_ids': array([[ 1, 2, 3, 4, 5, 0, 0],\n [ 6, 7, 8, 9, 10, 11, 12]]),\n 'mask': array([[ True, True, True, True, True, False, False],\n [ True, True, True, True, True, True, True]])}}\n```\n\n### Relation Extraction\n\n```python\nimport collatable\nfrom 
collatable.extras.indexer import LabelIndexer, TokenIndexer\nfrom collatable import AdjacencyField, Instance, ListField, SpanField, TextField\n\nPAD_TOKEN = \"<PAD>\"\ntoken_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))\nlabel_indexer = LabelIndexer[str]()\n\ninstances = []\nwith token_indexer.context(train=True), label_indexer.context(train=True):\n text = TextField(\n [\"john\", \"smith\", \"was\", \"born\", \"in\", \"new\", \"york\", \"and\", \"now\", \"lives\", \"in\", \"tokyo\"],\n indexer=token_indexer,\n padding_value=token_indexer[PAD_TOKEN],\n )\n spans = ListField([SpanField(0, 2, text), SpanField(5, 7, text), SpanField(11, 12, text)])\n relations = AdjacencyField([(0, 1), (0, 2)], spans, labels=[\"born-in\", \"lives-in\"], indexer=label_indexer)\n instance = Instance(text=text, spans=spans, relations=relations)\n instances.append(instance)\n\n text = TextField(\n [\"tokyo\", \"is\", \"the\", \"capital\", \"of\", \"japan\"],\n indexer=token_indexer,\n padding_value=token_indexer[PAD_TOKEN],\n )\n spans = ListField([SpanField(0, 1, text), SpanField(5, 6, text)])\n relations = AdjacencyField([(0, 1)], spans, labels=[\"capital-of\"], indexer=label_indexer)\n instance = Instance(text=text, spans=spans, relations=relations)\n instances.append(instance)\n\noutput = collatable.collate(instances)\nprint(output)\n```\n\nExecution result:\n\n```text\n{'text': {\n 'token_ids': array([[ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 5, 11],\n [11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 0]]),\n 'mask': array([[ True, True, True, True, True, True, True, True, True, True, True, True],\n [ True, True, True, True, True, True, False, False, False, False, False, False]])},\n 'spans': array([[[ 0, 2],\n [ 5, 7],\n [11, 12]],\n [[ 0, 1],\n [ 5, 6],\n [-1, -1]]]),\n 'relations': array([[[-1, 0, 1],\n [-1, -1, -1],\n [-1, -1, -1]],\n [[-1, 2, -1],\n [-1, -1, -1],\n [-1, -1, -1]]], dtype=int32)}\n```\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "Constructing batched tensors for any machine learning tasks",
"version": "0.3.1",
"split_keywords": [
"python",
"machine learning"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "dbd388c2017bd5b731bd0d2db7f18bc76716c7ac07099748cc448dd8da192b29",
"md5": "0f79dfb14ba87df8ef4be26d343e6695",
"sha256": "d16464a1f1314b8304212e98819016bbc87bd6c0f1bda10adb614f9c19bd61f1"
},
"downloads": -1,
"filename": "collatable-0.3.1-py3-none-any.whl",
"has_sig": false,
"md5_digest": "0f79dfb14ba87df8ef4be26d343e6695",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.8.1,<4.0",
"size": 19106,
"upload_time": "2023-02-06T01:01:02",
"upload_time_iso_8601": "2023-02-06T01:01:02.966490Z",
"url": "https://files.pythonhosted.org/packages/db/d3/88c2017bd5b731bd0d2db7f18bc76716c7ac07099748cc448dd8da192b29/collatable-0.3.1-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "83acde2548d9a7f47e143e96e010922e4f34b28fa268441ab448c2eae40d5747",
"md5": "4c710aca3c7fa149c392176013517ef6",
"sha256": "41fda2ee649c819bfced1e3ee3bf3e22442c8efa22eedef679f687c807829ca5"
},
"downloads": -1,
"filename": "collatable-0.3.1.tar.gz",
"has_sig": false,
"md5_digest": "4c710aca3c7fa149c392176013517ef6",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.8.1,<4.0",
"size": 13610,
"upload_time": "2023-02-06T01:01:04",
"upload_time_iso_8601": "2023-02-06T01:01:04.572241Z",
"url": "https://files.pythonhosted.org/packages/83/ac/de2548d9a7f47e143e96e010922e4f34b28fa268441ab448c2eae40d5747/collatable-0.3.1.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-02-06 01:01:04",
"github": true,
"gitlab": false,
"bitbucket": false,
"github_user": "altescy",
"github_project": "collatable",
"travis_ci": false,
"coveralls": false,
"github_actions": true,
"lcname": "collatable"
}