# Simple Annotation Framework (SAF)
The Simple Annotation Framework (SAF) is a lightweight Python library for annotating text data. It provides a simple and flexible way to create, manipulate, and export annotations in various formats.

SAF is built upon a minimalistic data model, accessible through its API. This data model is flexible enough to be used by most types of linguistic annotation, and can store other types of data associated with the language items (e.g., statistics, data sources, schemas, etc.).

## Installation
To install SAF, you can use pip:
```bash
pip install saf-nlp
```
## Usage
### Importing Text Data
SAF provides importers for different annotated text data formats, including plain text, CoNLL and WebAnno.
#### Plain text
```python
from saf.importers.plain import PlainTextImporter
from saf.constants import annotation
from nltk.tokenize import sent_tokenize, word_tokenize
plain_doc = """
They buy and sell books.
I have no clue.
"""
# Import document
plain_importer = PlainTextImporter(sent_tokenize, word_tokenize)
doc = plain_importer.import_document(plain_doc)
print(len(doc.sentences))  # Number of sentences in the document.
print([tok.surface for tok in doc.sentences[1].tokens])  # Listing tokens for the second sentence in the document.
```
#### CoNLL
```python
from saf import Document
from saf.constants import annotation
from saf.importers.conll import CoNLLImporter
conll_doc = """
# sent_id = 1
# text = They buy and sell books.
1   They     they    PRON    PRP    Case=Nom|Number=Plur               2   nsubj   2:nsubj|4:nsubj   _
2   buy      buy     VERB    VBP    Number=Plur|Person=3|Tense=Pres    0   root    0:root            _
3   and      and     CCONJ   CC     _                                  4   cc      4:cc              _
4   sell     sell    VERB    VBP    Number=Plur|Person=3|Tense=Pres    2   conj    0:root|2:conj     _
5   books    book    NOUN    NNS    Number=Plur                        2   obj     2:obj|4:obj       SpaceAfter=No
6   .        .       PUNCT   .      _                                  2   punct   2:punct           _
# sent_id = 2
# text = I have no clue.
1   I       I       PRON    PRP   Case=Nom|Number=Sing|Person=1     2   nsubj   _   _
2   have    have    VERB    VBP   Number=Sing|Person=1|Tense=Pres   0   root    _   _
3   no      no      DET     DT    PronType=Neg                      4   det     _   _
4   clue    clue    NOUN    NN    Number=Sing                       2   obj     _   SpaceAfter=No
5   .       .       PUNCT   .     _                                 2   punct   _   _
"""
conll_importer = CoNLLImporter(field_list=[annotation.LEMMA, annotation.UPOS, annotation.POS])
doc = conll_importer.import_document(conll_doc)
print(len(doc.sentences))  # Number of sentences in the document.
print(doc.sentences[0].surface)  # Surface form of the first sentence in the document.
print([tok.annotations[annotation.UPOS] for tok in doc.sentences[1].tokens]) # All universal POS tags from the second sentence.
```
### Annotating Text Data
The [saf_datasets](https://github.com/neuro-symbolic-ai/saf_datasets) library provides various annotated NLP datasets and facilities for automated annotation of your own data.   
### Exporting Annotated Text Data
SAF provides formatters for different annotation formats:
#### CoNLL
```python
from saf.importers.plain import PlainTextImporter
from saf.constants import annotation
from nltk.tokenize import sent_tokenize, word_tokenize
from saf.formatters.conll import CoNLLFormatter
plain_doc = """
They buy and sell books.
I have no clue.
"""
# Import document
plain_importer = PlainTextImporter(sent_tokenize, word_tokenize)
doc = plain_importer.import_document(plain_doc)
# Annotate tokens
for sent in doc.sentences:
    for i, token in enumerate(sent.tokens):
        token.annotations[annotation.ID] = str(i)
conll_formatter = CoNLLFormatter(field_list=[annotation.ID])
conll_formatted_doc = conll_formatter.dumps(doc)
print(conll_formatted_doc)
```
### Working with vocabularies
Vocabulary objects can be used to quickly index and manage symbols in documents or sentence collections. They facilitate vectorization for language model training, especially with label supervision.
```python
from saf import Document
from saf.constants import annotation
from saf.importers.conll import CoNLLImporter
from saf import Vocabulary
conll_doc = """
# sent_id = 1
# text = They buy and sell books.
1   They     they    PRON    PRP    Case=Nom|Number=Plur               2   nsubj   2:nsubj|4:nsubj   _
2   buy      buy     VERB    VBP    Number=Plur|Person=3|Tense=Pres    0   root    0:root            _
3   and      and     CCONJ   CC     _                                  4   cc      4:cc              _
4   sell     sell    VERB    VBP    Number=Plur|Person=3|Tense=Pres    2   conj    0:root|2:conj     _
5   books    book    NOUN    NNS    Number=Plur                        2   obj     2:obj|4:obj       SpaceAfter=No
6   .        .       PUNCT   .      _                                  2   punct   2:punct           _
# sent_id = 2
# text = I have no clue.
1   I       I       PRON    PRP   Case=Nom|Number=Sing|Person=1     2   nsubj   _   _
2   have    have    VERB    VBP   Number=Sing|Person=1|Tense=Pres   0   root    _   _
3   no      no      DET     DT    PronType=Neg                      4   det     _   _
4   clue    clue    NOUN    NN    Number=Sing                       2   obj     _   SpaceAfter=No
5   .       .       PUNCT   .     _                                 2   punct   _   _
"""
conll_importer = CoNLLImporter(field_list=[annotation.LEMMA, annotation.UPOS, annotation.POS])
doc = conll_importer.import_document(conll_doc)
token_vocab = Vocabulary(doc.sentences, lowercase=False)
upos_vocab = Vocabulary(doc.sentences, source="UPOS", lowercase=False)
# Converting sentences to indices for both tokens and annotations
print(token_vocab.to_indices(doc.sentences))
# [[2, 5, 3, 9, 4, 0], [1, 7, 8, 6, 0]]
print(upos_vocab.to_indices(doc.sentences))
# [[3, 5, 0, 5, 2, 4], [3, 5, 1, 2, 4]]
# Retrieving tokens and annotations from indices
token_vocab.get_symbol(4)
# books
upos_vocab.get_symbol(2)
# NOUN
```
## License
This project is licensed under the GNU General Public License Version 3 - see the [LICENSE](https://github.com/dscarvalho/saf/blob/master/LICENSE) file for details.
            
         
        Raw data
        
            {
    "_id": null,
    "home_page": null,
    "name": "saf-nlp",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.7",
    "maintainer_email": null,
    "keywords": "annotation, nlp",
    "author": "['Danilo S. Carvalho', 'Vu Duc Tran']",
    "author_email": "\"Danilo S. Carvalho\" <danilo.carvalho@manchester.ac.uk>, Vu Duc Tran <vu.tran@jaist.ac.jp>",
    "download_url": "https://files.pythonhosted.org/packages/d4/18/c488e53c834c42965607ff64dfd7fe336fce96293bdd736b69fe0edd8c7b/saf_nlp-0.5.1.tar.gz",
    "platform": null,
    "description": "# Simple Annotation Framework (SAF)\n\nThe Simple Annotation Framework (SAF) is a lightweight Python library for annotating text data. It provides a simple and flexible way to create, manipulate, and export annotations in various formats.\n\n\n\nSAF is built upon a minimalistic data model, accessible through its API. This data model is flexible enough to be used by most types of linguistic annotation, and can store other types of data associated to the language items (e.g., statistics, data sources, schemas, etc.)\n\n\n\n## Installation\n\nTo install SAF, you can use pip:\n\n```bash\npip install saf\n```\n\n## Usage\n\n### Importing Text Data\n\nSAF provides importers for different annotated text data formats, including plain text, ConLL and WebAnno.\n\n#### Plain text\n\n```python\nfrom saf.importers.plain import PlainTextImporter\nfrom saf.constants import annotation\nfrom nltk.tokenize import sent_tokenize, word_tokenize\n\nplain_doc = \"\"\"\nThey buy and sell books.\nI have no clue.\n\"\"\"\n\n# Import document\nplain_importer = PlainTextImporter(sent_tokenize, word_tokenize)\ndoc = plain_importer.import_document(plain_doc)\n\nprint(len(doc.sentences))  # Number of sentences in the document.\nprint([tok.surface for tok in doc.sentences[1].tokens])  # Listing tokens for the second sentence in the document.\n```\n\n#### ConLL\n\n```python\nfrom saf import Document\nfrom saf.constants import annotation\nfrom saf.importers.conll import CoNLLImporter\n\nconll_doc = \"\"\"\n# sent_id = 1\n# text = They buy and sell books.\n1   They     they    PRON    PRP    Case=Nom|Number=Plur               2   nsubj   2:nsubj|4:nsubj   _\n2   buy      buy     VERB    VBP    Number=Plur|Person=3|Tense=Pres    0   root    0:root            _\n3   and      and     CCONJ   CC     _                                  4   cc      4:cc              _\n4   sell     sell    VERB    VBP    Number=Plur|Person=3|Tense=Pres    2   conj    0:root|2:conj     _\n5   books    
book    NOUN    NNS    Number=Plur                        2   obj     2:obj|4:obj       SpaceAfter=No\n6   .        .       PUNCT   .      _                                  2   punct   2:punct           _\n\n# sent_id = 2\n# text = I have no clue.\n1   I       I       PRON    PRP   Case=Nom|Number=Sing|Person=1     2   nsubj   _   _\n2   have    have    VERB    VBP   Number=Sing|Person=1|Tense=Pres   0   root    _   _\n3   no      no      DET     DT    PronType=Neg                      4   det     _   _\n4   clue    clue    NOUN    NN    Number=Sing                       2   obj     _   SpaceAfter=No\n5   .       .       PUNCT   .     _                                 2   punct   _   _\n\"\"\"\n\nconll_importer = CoNLLImporter(field_list=[annotation.LEMMA, annotation.UPOS, annotation.POS])\ndoc = conll_importer.import_document(conll_doc)\n\nprint(len(doc.sentences))  # Number of sentences in the document.\nprint(doc.sentences[0].surface)  # Surface form of the first sentence in the document.\nprint([tok.annotations[annotation.UPOS] for tok in doc.sentences[1].tokens]) # All universal POS tags from the second sentence.\n```\n\n\n### Annotating Text Data\n\nThe [saf_datasets](https://github.com/neuro-symbolic-ai/saf_datasets) library provides various annotated NLP datasets and facilities for automated annotation of your own data.   
\n\n\n\n### Exporting Annotated Text Data\n\nSAF provides formatters for different annotation formats:\n\n\n#### ConLL\n\n```python\nfrom saf.importers.plain import PlainTextImporter\nfrom saf.constants import annotation\nfrom nltk.tokenize import sent_tokenize, word_tokenize\nfrom saf.formatters.conll import CoNLLFormatter\n\nplain_doc = \"\"\"\nThey buy and sell books.\nI have no clue.\n\"\"\"\n\n# Import document\nplain_importer = PlainTextImporter(sent_tokenize, word_tokenize)\ndoc = plain_importer.import_document(plain_doc)\n\n# Annotate tokens\nfor sent in doc.sentences:\n    for i, token in enumerate(sent.tokens):\n        token.annotations[annotation.ID] = str(i)\n\nconll_formatter = CoNLLFormatter(field_list=[annotation.ID])\nconll_formatted_doc = conll_formatter.dumps(doc)\n\nprint(conll_formatted_doc)\n```\n\n### Working with vocabularies\n\nVocabulary objects can be used to quickly index and manage symbols in documents or sentence collections.  They facilitate vectorization for language model training, specially with label supervision. \n\n```python\nfrom saf import Document\nfrom saf.constants import annotation\nfrom saf.importers.conll import CoNLLImporter\nfrom saf import Vocabulary\n\nconll_doc = \"\"\"\n# sent_id = 1\n# text = They buy and sell books.\n1   They     they    PRON    PRP    Case=Nom|Number=Plur               2   nsubj   2:nsubj|4:nsubj   _\n2   buy      buy     VERB    VBP    Number=Plur|Person=3|Tense=Pres    0   root    0:root            _\n3   and      and     CCONJ   CC     _                                  4   cc      4:cc              _\n4   sell     sell    VERB    VBP    Number=Plur|Person=3|Tense=Pres    2   conj    0:root|2:conj     _\n5   books    book    NOUN    NNS    Number=Plur                        2   obj     2:obj|4:obj       SpaceAfter=No\n6   .        .       PUNCT   .      
_                                  2   punct   2:punct           _\n\n# sent_id = 2\n# text = I have no clue.\n1   I       I       PRON    PRP   Case=Nom|Number=Sing|Person=1     2   nsubj   _   _\n2   have    have    VERB    VBP   Number=Sing|Person=1|Tense=Pres   0   root    _   _\n3   no      no      DET     DT    PronType=Neg                      4   det     _   _\n4   clue    clue    NOUN    NN    Number=Sing                       2   obj     _   SpaceAfter=No\n5   .       .       PUNCT   .     _                                 2   punct   _   _\n\"\"\"\n\nconll_importer = CoNLLImporter(field_list=[annotation.LEMMA, annotation.UPOS, annotation.POS])\ndoc = conll_importer.import_document(conll_doc)\n\ntoken_vocab = Vocabulary(doc.sentences, lowercase=False)\nupos_vocab = Vocabulary(doc.sentences, source=\"UPOS\", lowercase=False)\n\n# Converting sentences to indices for both tokens and annotations\nprint(token_vocab.to_indices(doc.sentences))\n# [[2, 5, 3, 9, 4, 0], [1, 7, 8, 6, 0]]\n\nprint(upos_vocab.to_indices(doc.sentences))\n# [[3, 5, 0, 5, 2, 4], [3, 5, 1, 2, 4]]\n\n# Retrieving tokens and annotations from indices\ntoken_vocab.get_symbol(4)\n# books\n\nupos_vocab.get_symbol(2)\n# NOUN\n```\n\n\n## License\n\nThis project is licensed under the GNU General Public License Version 3 - see the [LICENSE](https://github.com/dscarvalho/saf/blob/master/LICENSE) file for details.\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "Simple Annotation Framework",
    "version": "0.5.1",
    "project_urls": {
        "Homepage": "https://github.com/dscarvalho/saf",
        "Issues": "https://github.com/dscarvalho/saf/issues"
    },
    "split_keywords": [
        "annotation",
        " nlp"
    ],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "1140c2b2f3db7f36cbb69b080057cabf1ddaa16990766fee7d1937ab1e143b69",
                "md5": "f2b0b4f506e52c69d2272677b18a44e5",
                "sha256": "2349333324a5039409202d771ce267fc329796b067eb5121080a66e6b1bd6ac0"
            },
            "downloads": -1,
            "filename": "saf_nlp-0.5.1-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "f2b0b4f506e52c69d2272677b18a44e5",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.7",
            "size": 34202,
            "upload_time": "2024-05-09T15:25:04",
            "upload_time_iso_8601": "2024-05-09T15:25:04.167654Z",
            "url": "https://files.pythonhosted.org/packages/11/40/c2b2f3db7f36cbb69b080057cabf1ddaa16990766fee7d1937ab1e143b69/saf_nlp-0.5.1-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": "",
            "digests": {
                "blake2b_256": "d418c488e53c834c42965607ff64dfd7fe336fce96293bdd736b69fe0edd8c7b",
                "md5": "70b218b9c9b175e77e6007895dc00c7e",
                "sha256": "8f8ccc64e2e3dd5c23059d7b9284e1b824b8474aa88aa99a2328dfd5f9aa453e"
            },
            "downloads": -1,
            "filename": "saf_nlp-0.5.1.tar.gz",
            "has_sig": false,
            "md5_digest": "70b218b9c9b175e77e6007895dc00c7e",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.7",
            "size": 27135,
            "upload_time": "2024-05-09T15:25:06",
            "upload_time_iso_8601": "2024-05-09T15:25:06.144194Z",
            "url": "https://files.pythonhosted.org/packages/d4/18/c488e53c834c42965607ff64dfd7fe336fce96293bdd736b69fe0edd8c7b/saf_nlp-0.5.1.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2024-05-09 15:25:06",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "dscarvalho",
    "github_project": "saf",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "lcname": "saf-nlp"
}