# 🐦 pigeonXT - Quickly annotate data in Jupyter Lab
PigeonXT is an extension of the original [Pigeon](https://github.com/agermanidis/pigeon), created by [Anastasis Germanidis](https://pypi.org/user/agermanidis/).
PigeonXT is a simple widget that lets you quickly annotate a dataset of
unlabeled examples from the comfort of your Jupyter notebook.
PigeonXT currently supports the following annotation tasks:
- binary / multi-class classification
- multi-label classification
- regression tasks
- captioning tasks
Anything that can be displayed on Jupyter
(text, images, audio, graphs, etc.) can be displayed by pigeon
by providing the appropriate `display_fn` argument.
Additionally, custom hooks can be attached to each row update (`example_process_fn`),
or when the annotating task is complete (`final_process_fn`).
There is a full blog post on the usage of PigeonXT on [Towards Data Science](https://towardsdatascience.com/quickly-label-data-in-jupyter-lab-999e7e455e9e).
### Contributors
- Anastasis Germanidis
- Dennis Bakhuis
- Ritesh Agrawal
- Deepak Tunuguntla
- Bram van Es
## Installation
PigeonXT obviously needs a Jupyter Lab environment. Furthermore, it requires ipywidgets.
The widget itself can be installed using pip:
```bash
pip install pigeonXT-jupyter
```
With JupyterLab 3, installation has become much easier.
To run the provided examples in a new environment using Conda:
```bash
conda create --name pigeon python=3.9
conda activate pigeon
pip install numpy pandas jupyterlab ipywidgets pigeonXT-jupyter
```
For an older Jupyterlab or any other trouble, please try the old method:
```bash
conda create --name pigeon python=3.7
conda activate pigeon
conda install nodejs
pip install numpy pandas jupyterlab ipywidgets
jupyter nbextension enable --py widgetsnbextension
jupyter labextension install @jupyter-widgets/jupyterlab-manager
pip install pigeonXT-jupyter
```
Starting Jupyter Lab environment:
```bash
jupyter lab
```
### Development environment
I have moved the development environment to Poetry. To create an identical environment use:
```bash
conda env create -f environment.yml
conda activate pigeonxt
poetry install
pre-commit install
```
## Examples
Examples are also provided in the accompanying notebook.
### Binary or multi-class text classification
Code:
```python
import pandas as pd
import pigeonXT as pixt
annotations = pixt.annotate(
['I love this movie', 'I was really disappointed by the book'],
options=['positive', 'negative', 'inbetween']
)
```
Preview:
![Jupyter notebook multi-class classification](/assets/multiclassexample.png)
### Multi-label text classification
Code:
```python
import pandas as pd
import pigeonXT as pixt
df = pd.DataFrame([
{'example': 'Star wars'},
{'example': 'The Positively True Adventures of the Alleged Texas Cheerleader-Murdering Mom'},
{'example': 'Eternal Sunshine of the Spotless Mind'},
{'example': 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb'},
{'example': 'Killer klowns from outer space'},
])
labels = ['Adventure', 'Romance', 'Fantasy', 'Science fiction', 'Horror', 'Thriller']
annotations = pixt.annotate(
df,
options=labels,
task_type='multilabel-classification',
buttons_in_a_row=3,
reset_buttons_after_click=True,
include_next=True,
include_back=True,
)
```
Preview:
![Jupyter notebook multi-label classification](/assets/multilabelexample.png)
### Image classification
Code:
```python
import pandas as pd
import pigeonXT as pixt
from IPython.display import display, Image
annotations = pixt.annotate(
['assets/img_example1.jpg', 'assets/img_example2.jpg'],
options=['cat', 'dog', 'horse'],
display_fn=lambda filename: display(Image(filename))
)
```
Preview:
![Jupyter notebook image classification](/assets/imagelabelexample.png)
### Audio classification
Code:
```python
import pandas as pd
import pigeonXT as pixt

# `display` is called inside `display_fn` below, so import it explicitly
# instead of relying on the notebook providing it as an implicit global.
from IPython.display import Audio, display

annotations = pixt.annotate(
    ['assets/audio_1.mp3', 'assets/audio_2.mp3'],
    task_type='regression',
    # NOTE(review): presumably (min, max, step) for the rating slider -- confirm.
    options=(1,5,1),
    display_fn=lambda filename: display(Audio(filename, autoplay=True))
)

annotations
```
Preview:
![Jupyter notebook audio annotation](/assets/audiolabelexample.png)
### Multi-label text classification with custom hooks
Code:
```python
import pandas as pd
import numpy as np
from pathlib import Path
from pigeonXT import annotate
df = pd.DataFrame([
{'example': 'Star wars'},
{'example': 'The Positively True Adventures of the Alleged Texas Cheerleader-Murdering Mom'},
{'example': 'Eternal Sunshine of the Spotless Mind'},
{'example': 'Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb'},
{'example': 'Killer klowns from outer space'},
])
labels = ['Adventure', 'Romance', 'Fantasy', 'Science fiction', 'Horror', 'Thriller']
shortLabels = ['A', 'R', 'F', 'SF', 'H', 'T']
df.to_csv('inputtestdata.csv', index=False)
def setLabels(labels, numClasses):
    """Build a multi-hot row vector for the selected class indices.

    Args:
        labels: Integer positions (e.g. a list of ints) of the selected
            classes; an empty list selects nothing.
        numClasses: Total number of classes, i.e. the length of the row.

    Returns:
        A 1-D ``numpy.uint8`` array of length ``numClasses`` with ones at
        the positions given by ``labels`` and zeros elsewhere.
    """
    row = np.zeros([numClasses], dtype=np.uint8)
    row[labels] = 1
    return row
def labelPortion(
    inputFile,
    labels=None,
    outputFile='output.csv',
    portionSize=2,
    textColumn='example',
    shortLabels=None,
):
    """Annotate the next portion of a CSV file and append it to an output CSV.

    The number of rows already present in ``outputFile`` decides where the
    next portion starts, so calling this function repeatedly walks through
    ``inputFile`` in chunks of ``portionSize`` rows.

    Args:
        inputFile: Path of the CSV file holding the examples to annotate.
        labels: Label names offered to the annotator. Defaults to
            ``['yes', 'no']``.
        outputFile: Path of the CSV file that collects annotated rows.
        portionSize: Maximum number of rows to annotate in this call.
        textColumn: Name of the column in ``inputFile`` holding the text.
        shortLabels: Column names used to store the labels, one per entry
            in ``labels``. Defaults to ``labels``.

    Returns:
        The DataFrame holding the current portion, with one column per
        short label. The hooks below fill these columns in place.
        NOTE(review): the annotation widget appears to be interactive, so
        the frame is presumably still unlabeled when returned -- confirm.
    """
    # Avoid a mutable default argument; build the default per call.
    if labels is None:
        labels = ['yes', 'no']
    if shortLabels is None:
        shortLabels = labels

    out = Path(outputFile)
    if out.exists():
        # Resume right after the rows that were already saved. The row count
        # (rather than ``index.max() + 1``) also works for a header-only file.
        currentId = len(pd.read_csv(out))
    else:
        currentId = 0

    indf = pd.read_csv(inputFile)
    examplesInFile = len(indf)
    # ``.loc`` slicing is inclusive on both ends, hence the ``- 1``. Copy the
    # slice so the label columns are added to an independent frame.
    indf = indf.loc[currentId:currentId + portionSize - 1].copy()
    actualPortionSize = len(indf)
    print(f'{currentId + 1} - {currentId + actualPortionSize} of {examplesInFile}')
    sentences = indf[textColumn].tolist()

    # Create the (still empty) label columns for this portion.
    for label in shortLabels:
        indf[label] = None

    def updateRow(example, selectedLabels):
        """Store the selected labels for ``example`` as a multi-hot row."""
        print(example, selectedLabels)
        labs = setLabels([labels.index(y) for y in selectedLabels], len(labels))
        indf.loc[indf[textColumn] == example, shortLabels] = labs

    def finalProcessing(annotations):
        """Append the annotated portion to the output file (or create it)."""
        if out.exists():
            prevdata = pd.read_csv(out)
            outdata = pd.concat([prevdata, indf]).reset_index(drop=True)
        else:
            outdata = indf.copy()
        outdata.to_csv(out, index=False)

    # The return value is not needed: results are collected via the hooks.
    annotate(
        sentences,
        options=labels,
        task_type='multilabel-classification',
        buttons_in_a_row=3,
        reset_buttons_after_click=True,
        include_next=False,
        example_process_fn=updateRow,
        final_process_fn=finalProcessing
    )
    return indf
def getAnnotationsCountPerlabel(annotations, shortLabels):
    """Count, per label column, how many rows are marked with 1.

    Args:
        annotations: DataFrame with one column per entry in ``shortLabels``.
        shortLabels: Names of the label columns to count.

    Returns:
        A one-row DataFrame (index ``'count'``) with one column per short
        label holding the number of rows whose value equals ``1.0``.
    """
    # Rows that are still unlabeled (None/NaN) never compare equal to 1.0,
    # so they are not counted.
    counts = {
        label: int((annotations[label] == 1.0).sum())
        for label in shortLabels
    }
    return pd.DataFrame([counts], index=['count'], columns=shortLabels)
annotations = labelPortion('inputtestdata.csv',
labels=labels,
shortLabels= shortLabels)
# counts per label
getAnnotationsCountPerlabel(annotations, shortLabels)
```
Preview:
![Jupyter notebook multi-label classification](/assets/pigeonhookfunctions.png)
The complete and runnable examples are available in the provided Notebook.
Raw data
{
"_id": null,
"home_page": "https://github.com/dennisbakhuis/pigeonXT",
"name": "pigeonxt-jupyter",
"maintainer": "",
"docs_url": null,
"requires_python": ">=3.9,<4.0",
"maintainer_email": "",
"keywords": "artificial inteligence,labeling,jupyter,machine learning,data science,data,science",
"author": "Dennis Bakhuis",
"author_email": "pypi@bakhuis.nu",
"download_url": "https://files.pythonhosted.org/packages/4b/6c/a212b35ec09e98d10c71a419a9b39bf7bd37d5265cee71259384fffc449a/pigeonxt_jupyter-0.7.3.tar.gz",
"platform": null,
"description": "# \ud83d\udc26 pigeonXT - Quickly annotate data in Jupyter Lab\nPigeonXT is an extention to the original [Pigeon](https://github.com/agermanidis/pigeon), created by [Anastasis Germanidis](https://pypi.org/user/agermanidis/).\nPigeonXT is a simple widget that lets you quickly annotate a dataset of\nunlabeled examples from the comfort of your Jupyter notebook.\n\nPigeonXT currently support the following annotation tasks:\n- binary / multi-class classification\n- multi-label classification\n- regression tasks\n- captioning tasks\n\nAnything that can be displayed on Jupyter\n(text, images, audio, graphs, etc.) can be displayed by pigeon\nby providing the appropriate `display_fn` argument.\n\nAdditionally, custom hooks can be attached to each row update (`example_process_fn`),\nor when the annotating task is complete(`final_process_fn`).\n\nThere is a full blog post on the usage of PigeonXT on [Towards Data Science](https://towardsdatascience.com/quickly-label-data-in-jupyter-lab-999e7e455e9e).\n\n### Contributors\n- Anastasis Germanidis\n- Dennis Bakhuis\n- Ritesh Agrawal\n- Deepak Tunuguntla\n- Bram van Es\n\n## Installation\nPigeonXT obviously needs a Jupyter Lab environment. 
Futhermore, it requires ipywidgets.\nThe widget itself can be installed using pip:\n```bash\n pip install pigeonXT-jupyter\n```\n\nCurrently, it is much easier to install due to Jupyterlab 3:\nTo run the provided examples in a new environment using Conda:\n```bash\n conda create --name pigeon python=3.9\n conda activate pigeon\n pip install numpy pandas jupyterlab ipywidgets pigeonXT-jupyter\n```\n\nFor an older Jupyterlab or any other trouble, please try the old method:\n```bash\n conda create --name pigeon python=3.7\n conda activate pigeon\n conda install nodejs\n pip install numpy pandas jupyterlab ipywidgets\n jupyter nbextension enable --py widgetsnbextension\n jupyter labextension install @jupyter-widgets/jupyterlab-manager\n\n pip install pigeonXT-jupyter\n```\n\nStarting Jupyter Lab environment:\n```bash\n jupyter lab\n```\n\n### Development environment\nI have moved the development environment to Poetry. To create an identical environment use:\n```bash\nconda env create -f environment.yml\nconda activate pigeonxt\npoetry install\npre-commit install\n```\n\n## Examples\nExamples are also provided in the accompanying notebook.\n\n### Binary or multi-class text classification\nCode:\n```python\n import pandas as pd\n import pigeonXT as pixt\n\n annotations = pixt.annotate(\n ['I love this movie', 'I was really disappointed by the book'],\n options=['positive', 'negative', 'inbetween']\n )\n```\n\nPreview:\n![Jupyter notebook multi-class classification](/assets/multiclassexample.png)\n\n### Multi-label text classification\nCode:\n```python\n import pandas as pd\n import pigeonXT as pixt\n\n df = pd.DataFrame([\n {'example': 'Star wars'},\n {'example': 'The Positively True Adventures of the Alleged Texas Cheerleader-Murdering Mom'},\n {'example': 'Eternal Sunshine of the Spotless Mind'},\n {'example': 'Dr. 
Strangelove or: How I Learned to Stop Worrying and Love the Bomb'},\n {'example': 'Killer klowns from outer space'},\n ])\n\n labels = ['Adventure', 'Romance', 'Fantasy', 'Science fiction', 'Horror', 'Thriller']\n\n annotations = pixt.annotate(\n df,\n options=labels,\n task_type='multilabel-classification',\n buttons_in_a_row=3,\n reset_buttons_after_click=True,\n include_next=True,\n include_back=True,\n )\n```\n\nPreview:\n![Jupyter notebook multi-label classification](/assets/multilabelexample.png)\n\n### Image classification\nCode:\n```python\n import pandas as pd\n import pigeonXT as pixt\n\n from IPython.display import display, Image\n\n annotations = pixt.annotate(\n ['assets/img_example1.jpg', 'assets/img_example2.jpg'],\n options=['cat', 'dog', 'horse'],\n display_fn=lambda filename: display(Image(filename))\n )\n```\n\nPreview:\n![Jupyter notebook multi-label classification](/assets/imagelabelexample.png)\n\n\n### Audio classification\nCode:\n```python\n import pandas as pd\n import pigeonXT as pixt\n\n from IPython.display import Audio\n\n annotations = pixt.annotate(\n ['assets/audio_1.mp3', 'assets/audio_2.mp3'],\n task_type='regression',\n options=(1,5,1),\n display_fn=lambda filename: display(Audio(filename, autoplay=True))\n )\n\n annotations\n```\n\nPreview:\n![Jupyter notebook multi-label classification](/assets/audiolabelexample.png)\n\n### multi-label text classification with custom hooks\nCode:\n```python\n import pandas as pd\n import numpy as np\n\n from pathlib import Path\n from pigeonXT import annotate\n\n df = pd.DataFrame([\n {'example': 'Star wars'},\n {'example': 'The Positively True Adventures of the Alleged Texas Cheerleader-Murdering Mom'},\n {'example': 'Eternal Sunshine of the Spotless Mind'},\n {'example': 'Dr. 
Strangelove or: How I Learned to Stop Worrying and Love the Bomb'},\n {'example': 'Killer klowns from outer space'},\n ])\n\n labels = ['Adventure', 'Romance', 'Fantasy', 'Science fiction', 'Horror', 'Thriller']\n shortLabels = ['A', 'R', 'F', 'SF', 'H', 'T']\n\n df.to_csv('inputtestdata.csv', index=False)\n\n\n def setLabels(labels, numClasses):\n row = np.zeros([numClasses], dtype=np.uint8)\n row[labels] = 1\n return row\n\n def labelPortion(\n inputFile,\n labels = ['yes', 'no'],\n outputFile='output.csv',\n portionSize=2,\n textColumn='example',\n shortLabels=None,\n ):\n if shortLabels == None:\n shortLabels = labels\n\n out = Path(outputFile)\n if out.exists():\n outdf = pd.read_csv(out)\n currentId = outdf.index.max() + 1\n else:\n currentId = 0\n\n indf = pd.read_csv(inputFile)\n examplesInFile = len(indf)\n indf = indf.loc[currentId:currentId + portionSize - 1]\n actualPortionSize = len(indf)\n print(f'{currentId + 1} - {currentId + actualPortionSize} of {examplesInFile}')\n sentences = indf[textColumn].tolist()\n\n for label in shortLabels:\n indf[label] = None\n\n def updateRow(example, selectedLabels):\n print(example, selectedLabels)\n labs = setLabels([labels.index(y) for y in selectedLabels], len(labels))\n indf.loc[indf[textColumn] == example, shortLabels] = labs\n\n def finalProcessing(annotations):\n if out.exists():\n prevdata = pd.read_csv(out)\n outdata = pd.concat([prevdata, indf]).reset_index(drop=True)\n else:\n outdata = indf.copy()\n outdata.to_csv(out, index=False)\n\n annotated = annotate(\n sentences,\n options=labels,\n task_type='multilabel-classification',\n buttons_in_a_row=3,\n reset_buttons_after_click=True,\n include_next=False,\n example_process_fn=updateRow,\n final_process_fn=finalProcessing\n )\n return indf\n\n def getAnnotationsCountPerlabel(annotations, shortLabels):\n\n countPerLabel = pd.DataFrame(columns=shortLabels, index=['count'])\n\n for label in shortLabels:\n countPerLabel.loc['count', label] = 
len(annotations.loc[annotations[label] == 1.0])\n\n return countPerLabel\n\n def getAnnotationsCountPerlabel(annotations, shortLabels):\n\n countPerLabel = pd.DataFrame(columns=shortLabels, index=['count'])\n\n for label in shortLabels:\n countPerLabel.loc['count', label] = len(annotations.loc[annotations[label] == 1.0])\n\n return countPerLabel\n\n\n annotations = labelPortion('inputtestdata.csv',\n labels=labels,\n shortLabels= shortLabels)\n\n # counts per label\n getAnnotationsCountPerlabel(annotations, shortLabels)\n```\n\nPreview:\n![Jupyter notebook multi-label classification](/assets/pigeonhookfunctions.png)\n\n\nThe complete and runnable examples are available in the provided Notebook.\n",
"bugtrack_url": null,
"license": "Apache 2.0",
"summary": "Quickly annotate data in Jupyter notebooks.",
"version": "0.7.3",
"split_keywords": [
"artificial inteligence",
"labeling",
"jupyter",
"machine learning",
"data science",
"data",
"science"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "f96e379fcffc85ecbe93b32649aa29289972fa0ee44461a4ef323c48b647bd40",
"md5": "c54c218fba04bf9a78620be59968d310",
"sha256": "ce88b18af317ab76752a58e171323766763faf8edd7d7a5a22fb6c6479459545"
},
"downloads": -1,
"filename": "pigeonxt_jupyter-0.7.3-py3-none-any.whl",
"has_sig": false,
"md5_digest": "c54c218fba04bf9a78620be59968d310",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.9,<4.0",
"size": 12819,
"upload_time": "2023-02-02T16:34:05",
"upload_time_iso_8601": "2023-02-02T16:34:05.384760Z",
"url": "https://files.pythonhosted.org/packages/f9/6e/379fcffc85ecbe93b32649aa29289972fa0ee44461a4ef323c48b647bd40/pigeonxt_jupyter-0.7.3-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "4b6ca212b35ec09e98d10c71a419a9b39bf7bd37d5265cee71259384fffc449a",
"md5": "714bc561acae0aff508548ef7909c37b",
"sha256": "012e832463bb9888f609159b51294d3aeeb94ce0d680d482c9fa3734c040f81c"
},
"downloads": -1,
"filename": "pigeonxt_jupyter-0.7.3.tar.gz",
"has_sig": false,
"md5_digest": "714bc561acae0aff508548ef7909c37b",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.9,<4.0",
"size": 14743,
"upload_time": "2023-02-02T16:34:06",
"upload_time_iso_8601": "2023-02-02T16:34:06.661385Z",
"url": "https://files.pythonhosted.org/packages/4b/6c/a212b35ec09e98d10c71a419a9b39bf7bd37d5265cee71259384fffc449a/pigeonxt_jupyter-0.7.3.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2023-02-02 16:34:06",
"github": true,
"gitlab": false,
"bitbucket": false,
"github_user": "dennisbakhuis",
"github_project": "pigeonXT",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [],
"lcname": "pigeonxt-jupyter"
}