collatable


Namecollatable JSON
Version 0.3.1 PyPI version JSON
download
home_page: https://github.com/altescy/collatable
Summary: Constructing batched tensors for any machine learning tasks
upload_time: 2023-02-06 01:01:04
maintainer:
docs_url: None
author: altescy
requires_python: >=3.8.1,<4.0
license: MIT
keywords: python, machine learning
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
# Collatable

[![Actions Status](https://github.com/altescy/collatable/workflows/CI/badge.svg)](https://github.com/altescy/collatable/actions/workflows/ci.yml)
[![License](https://img.shields.io/github/license/altescy/collatable)](https://github.com/altescy/collatable/blob/main/LICENSE)
[![Python version](https://img.shields.io/pypi/pyversions/collatable)](https://github.com/altescy/collatable)
[![pypi version](https://img.shields.io/pypi/v/collatable)](https://pypi.org/project/collatable/)

Constructing batched tensors for any machine learning task

## Installation

```bash
pip install collatable
```

## Examples

The following scripts show how to tokenize/index/collate your dataset with `collatable`:

### Text Classification

```python
import collatable
from collatable import Instance, LabelField, MetadataField, TextField
from collatable.extras.indexer import LabelIndexer, TokenIndexer

dataset = [
    ("this is awesome", "positive"),
    ("this is a bad movie", "negative"),
    ("this movie is an awesome movie", "positive"),
    ("this movie is too bad to watch", "negative"),
]

# Set up indexers for tokens and labels
PAD_TOKEN = "<PAD>"
UNK_TOKEN = "<UNK>"
token_indexer = TokenIndexer[str](specials=[PAD_TOKEN, UNK_TOKEN], default=UNK_TOKEN)
label_indexer = LabelIndexer[str]()

# Load training dataset
instances = []
with token_indexer.context(train=True), label_indexer.context(train=True):
    for id_, (text, label) in enumerate(dataset):
        # Prepare each field with the corresponding field class
        text_field = TextField(
            text.split(),
            indexer=token_indexer,
            padding_value=token_indexer[PAD_TOKEN],
        )
        label_field = LabelField(
            label,
            indexer=label_indexer,
        )
        metadata_field = MetadataField({"id": id_})
        # Combine these fields into an instance
        instance = Instance(
            text=text_field,
            label=label_field,
            metadata=metadata_field,
        )
        instances.append(instance)

# Collate instances and build batch
output = collatable.collate(instances)
print(output)
```

Execution result:

```text
{'metadata': [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}],
 'text': {
    'token_ids': array([[ 2,  3,  4,  0,  0,  0,  0],
                        [ 2,  3,  5,  6,  7,  0,  0],
                        [ 2,  7,  3,  8,  4,  7,  0],
                        [ 2,  7,  3,  9,  6, 10, 11]]),
    'mask': array([[ True,  True,  True, False, False, False, False],
                   [ True,  True,  True,  True,  True, False, False],
                   [ True,  True,  True,  True,  True,  True, False],
                   [ True,  True,  True,  True,  True,  True,  True]])},
 'label': array([0, 1, 0, 1], dtype=int32)}
```

### Sequence Labeling

```python
import collatable
from collatable import Instance, SequenceLabelField, TextField
from collatable.extras.indexer import LabelIndexer, TokenIndexer

dataset = [
    (["my", "name", "is", "john", "smith"], ["O", "O", "O", "B", "I"]),
    (["i", "lived", "in", "japan", "three", "years", "ago"], ["O", "O", "O", "U", "O", "O", "O"]),
]

# Set up indexers for tokens and labels
PAD_TOKEN = "<PAD>"
token_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))
label_indexer = LabelIndexer[str]()

# Load training dataset
instances = []
with token_indexer.context(train=True), label_indexer.context(train=True):
    for tokens, labels in dataset:
        text_field = TextField(tokens, indexer=token_indexer, padding_value=token_indexer[PAD_TOKEN])
        label_field = SequenceLabelField(labels, text_field, indexer=label_indexer)
        instance = Instance(text=text_field, label=label_field)
        instances.append(instance)

output = collatable.collate(instances)
print(output)
```

Execution result:

```text
{'label': array([[0, 0, 0, 1, 2, 0, 0],
                 [0, 0, 0, 3, 0, 0, 0]]),
 'text': {
    'token_ids': array([[ 1,  2,  3,  4,  5,  0,  0],
                        [ 6,  7,  8,  9, 10, 11, 12]]),
    'mask': array([[ True,  True,  True,  True,  True, False, False],
                   [ True,  True,  True,  True,  True,  True,  True]])}}
```

### Relation Extraction

```python
import collatable
from collatable.extras.indexer import LabelIndexer, TokenIndexer
from collatable import AdjacencyField, Instance, ListField, SpanField, TextField

PAD_TOKEN = "<PAD>"
token_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))
label_indexer = LabelIndexer[str]()

instances = []
with token_indexer.context(train=True), label_indexer.context(train=True):
    text = TextField(
        ["john", "smith", "was", "born", "in", "new", "york", "and", "now", "lives", "in", "tokyo"],
        indexer=token_indexer,
        padding_value=token_indexer[PAD_TOKEN],
    )
    spans = ListField([SpanField(0, 2, text), SpanField(5, 7, text), SpanField(11, 12, text)])
    relations = AdjacencyField([(0, 1), (0, 2)], spans, labels=["born-in", "lives-in"], indexer=label_indexer)
    instance = Instance(text=text, spans=spans, relations=relations)
    instances.append(instance)

    text = TextField(
        ["tokyo", "is", "the", "capital", "of", "japan"],
        indexer=token_indexer,
        padding_value=token_indexer[PAD_TOKEN],
    )
    spans = ListField([SpanField(0, 1, text), SpanField(5, 6, text)])
    relations = AdjacencyField([(0, 1)], spans, labels=["capital-of"], indexer=label_indexer)
    instance = Instance(text=text, spans=spans, relations=relations)
    instances.append(instance)

output = collatable.collate(instances)
print(output)
```

Execution result:

```text
{'text': {
    'token_ids': array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  5, 11],
                        [11, 12, 13, 14, 15, 16,  0,  0,  0,  0,  0,  0]]),
    'mask': array([[ True,  True,  True,  True,  True,  True,  True,  True,  True, True,  True,  True],
                   [ True,  True,  True,  True,  True,  True, False, False, False, False, False, False]])},
 'spans': array([[[ 0,  2],
                  [ 5,  7],
                  [11, 12]],
                 [[ 0,  1],
                  [ 5,  6],
                  [-1, -1]]]),
 'relations': array([[[-1,  0,  1],
                      [-1, -1, -1],
                      [-1, -1, -1]],
                     [[-1,  2, -1],
                      [-1, -1, -1],
                      [-1, -1, -1]]], dtype=int32)}
```

            

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/altescy/collatable",
    "name": "collatable",
    "maintainer": "",
    "docs_url": null,
    "requires_python": ">=3.8.1,<4.0",
    "maintainer_email": "",
    "keywords": "python,machine learning",
    "author": "altescy",
    "author_email": "altescy@fastmail.com",
    "download_url": "https://files.pythonhosted.org/packages/83/ac/de2548d9a7f47e143e96e010922e4f34b28fa268441ab448c2eae40d5747/collatable-0.3.1.tar.gz",
    "platform": null,
    "description": "# Collatable\n\n[![Actions Status](https://github.com/altescy/collatable/workflows/CI/badge.svg)](https://github.com/altescy/collatable/actions/workflows/ci.yml)\n[![License](https://img.shields.io/github/license/altescy/collatable)](https://github.com/altescy/collatable/blob/main/LICENSE)\n[![Python version](https://img.shields.io/pypi/pyversions/collatable)](https://github.com/altescy/collatable)\n[![pypi version](https://img.shields.io/pypi/v/collatable)](https://pypi.org/project/collatable/)\n\nConstructing batched tensors for any machine learning tasks\n\n## Installation\n\n```bash\npip install collatable\n```\n\n## Examples\n\nThe following scripts show how to tokenize/index/collate your dataset with `collatable`:\n\n### Text Classification\n\n```python\nimport collatable\nfrom collatable import Instance, LabelField, MetadataField, TextField\nfrom collatable.extras.indexer import LabelIndexer, TokenIndexer\n\ndataset = [\n    (\"this is awesome\", \"positive\"),\n    (\"this is a bad movie\", \"negative\"),\n    (\"this movie is an awesome movie\", \"positive\"),\n    (\"this movie is too bad to watch\", \"negative\"),\n]\n\n# Set up indexers for tokens and labels\nPAD_TOKEN = \"<PAD>\"\nUNK_TOKEN = \"<UNK>\"\ntoken_indexer = TokenIndexer[str](specials=[PAD_TOKEN, UNK_TOKEN], default=UNK_TOKEN)\nlabel_indexer = LabelIndexer[str]()\n\n# Load training dataset\ninstances = []\nwith token_indexer.context(train=True), label_indexer.context(train=True):\n    for id_, (text, label) in enumerate(dataset):\n        # Prepare each field with the corresponding field class\n        text_field = TextField(\n            text.split(),\n            indexer=token_indexer,\n            padding_value=token_indexer[PAD_TOKEN],\n        )\n        label_field = LabelField(\n            label,\n            indexer=label_indexer,\n        )\n        metadata_field = MetadataField({\"id\": id_})\n        # Combine these fields into instance\n        instance = 
Instance(\n            text=text_field,\n            label=label_field,\n            metadata=metadata_field,\n        )\n        instances.append(instance)\n\n# Collate instances and build batch\noutput = collatable.collate(instances)\nprint(output)\n```\n\nExecution result:\n\n```text\n{'metadata': [{'id': 0}, {'id': 1}, {'id': 2}, {'id': 3}],\n 'text': {\n    'token_ids': array([[ 2,  3,  4,  0,  0,  0,  0],\n                        [ 2,  3,  5,  6,  7,  0,  0],\n                        [ 2,  7,  3,  8,  4,  7,  0],\n                        [ 2,  7,  3,  9,  6, 10, 11]]),\n    'mask': array([[ True,  True,  True, False, False, False, False],\n                   [ True,  True,  True,  True,  True, False, False],\n                   [ True,  True,  True,  True,  True,  True, False],\n                   [ True,  True,  True,  True,  True,  True,  True]])},\n 'label': array([0, 1, 0, 1], dtype=int32)}\n```\n\n### Sequence Labeling\n\n```python\nimport collatable\nfrom collatable import Instance, SequenceLabelField, TextField\nfrom collatable.extras.indexer import LabelIndexer, TokenIndexer\n\ndataset = [\n    ([\"my\", \"name\", \"is\", \"john\", \"smith\"], [\"O\", \"O\", \"O\", \"B\", \"I\"]),\n    ([\"i\", \"lived\", \"in\", \"japan\", \"three\", \"years\", \"ago\"], [\"O\", \"O\", \"O\", \"U\", \"O\", \"O\", \"O\"]),\n]\n\n# Set up indexers for tokens and labels\nPAD_TOKEN = \"<PAD>\"\ntoken_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))\nlabel_indexer = LabelIndexer[str]()\n\n# Load training dataset\ninstances = []\nwith token_indexer.context(train=True), label_indexer.context(train=True):\n    for tokens, labels in dataset:\n        text_field = TextField(tokens, indexer=token_indexer, padding_value=token_indexer[PAD_TOKEN])\n        label_field = SequenceLabelField(labels, text_field, indexer=label_indexer)\n        instance = Instance(text=text_field, label=label_field)\n        instances.append(instance)\n\noutput = 
collatable.collate(instances)\nprint(output)\n```\n\nExecution result:\n\n```text\n{'label': array([[0, 0, 0, 1, 2, 0, 0],\n                 [0, 0, 0, 3, 0, 0, 0]]),\n 'text': {\n    'token_ids': array([[ 1,  2,  3,  4,  5,  0,  0],\n                        [ 6,  7,  8,  9, 10, 11, 12]]),\n    'mask': array([[ True,  True,  True,  True,  True, False, False],\n                   [ True,  True,  True,  True,  True,  True,  True]])}}\n```\n\n### Relation Extraction\n\n```python\nimport collatable\nfrom collatable.extras.indexer import LabelIndexer, TokenIndexer\nfrom collatable import AdjacencyField, Instance, ListField, SpanField, TextField\n\nPAD_TOKEN = \"<PAD>\"\ntoken_indexer = TokenIndexer[str](specials=(PAD_TOKEN,))\nlabel_indexer = LabelIndexer[str]()\n\ninstances = []\nwith token_indexer.context(train=True), label_indexer.context(train=True):\n    text = TextField(\n        [\"john\", \"smith\", \"was\", \"born\", \"in\", \"new\", \"york\", \"and\", \"now\", \"lives\", \"in\", \"tokyo\"],\n        indexer=token_indexer,\n        padding_value=token_indexer[PAD_TOKEN],\n    )\n    spans = ListField([SpanField(0, 2, text), SpanField(5, 7, text), SpanField(11, 12, text)])\n    relations = AdjacencyField([(0, 1), (0, 2)], spans, labels=[\"born-in\", \"lives-in\"], indexer=label_indexer)\n    instance = Instance(text=text, spans=spans, relations=relations)\n    instances.append(instance)\n\n    text = TextField(\n        [\"tokyo\", \"is\", \"the\", \"capital\", \"of\", \"japan\"],\n        indexer=token_indexer,\n        padding_value=token_indexer[PAD_TOKEN],\n    )\n    spans = ListField([SpanField(0, 1, text), SpanField(5, 6, text)])\n    relations = AdjacencyField([(0, 1)], spans, labels=[\"capital-of\"], indexer=label_indexer)\n    instance = Instance(text=text, spans=spans, relations=relations)\n    instances.append(instance)\n\noutput = collatable.collate(instances)\nprint(output)\n```\n\nExecution result:\n\n```text\n{'text': {\n    'token_ids': array([[ 
1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  5, 11],\n                        [11, 12, 13, 14, 15, 16,  0,  0,  0,  0,  0,  0]]),\n    'mask': array([[ True,  True,  True,  True,  True,  True,  True,  True,  True, True,  True,  True],\n                   [ True,  True,  True,  True,  True,  True, False, False, False, False, False, False]])},\n 'spans': array([[[ 0,  2],\n                  [ 5,  7],\n                  [11, 12]],\n                 [[ 0,  1],\n                  [ 5,  6],\n                  [-1, -1]]]),\n 'relations': array([[[-1,  0,  1],\n                      [-1, -1, -1],\n                      [-1, -1, -1]],\n                     [[-1,  2, -1],\n                      [-1, -1, -1],\n                      [-1, -1, -1]]], dtype=int32)}\n```\n",
    "bugtrack_url": null,
    "license": "MIT",
    "summary": "Constructing batched tensors for any machine learning tasks",
    "version": "0.3.1",
    "split_keywords": [
        "python",
        "machine learning"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "dbd388c2017bd5b731bd0d2db7f18bc76716c7ac07099748cc448dd8da192b29",
                "md5": "0f79dfb14ba87df8ef4be26d343e6695",
                "sha256": "d16464a1f1314b8304212e98819016bbc87bd6c0f1bda10adb614f9c19bd61f1"
            },
            "downloads": -1,
            "filename": "collatable-0.3.1-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "0f79dfb14ba87df8ef4be26d343e6695",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.8.1,<4.0",
            "size": 19106,
            "upload_time": "2023-02-06T01:01:02",
            "upload_time_iso_8601": "2023-02-06T01:01:02.966490Z",
            "url": "https://files.pythonhosted.org/packages/db/d3/88c2017bd5b731bd0d2db7f18bc76716c7ac07099748cc448dd8da192b29/collatable-0.3.1-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "83acde2548d9a7f47e143e96e010922e4f34b28fa268441ab448c2eae40d5747",
                "md5": "4c710aca3c7fa149c392176013517ef6",
                "sha256": "41fda2ee649c819bfced1e3ee3bf3e22442c8efa22eedef679f687c807829ca5"
            },
            "downloads": -1,
            "filename": "collatable-0.3.1.tar.gz",
            "has_sig": false,
            "md5_digest": "4c710aca3c7fa149c392176013517ef6",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.8.1,<4.0",
            "size": 13610,
            "upload_time": "2023-02-06T01:01:04",
            "upload_time_iso_8601": "2023-02-06T01:01:04.572241Z",
            "url": "https://files.pythonhosted.org/packages/83/ac/de2548d9a7f47e143e96e010922e4f34b28fa268441ab448c2eae40d5747/collatable-0.3.1.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2023-02-06 01:01:04",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "github_user": "altescy",
    "github_project": "collatable",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": true,
    "lcname": "collatable"
}
        
Elapsed time: 0.08689s