flashtext


Nameflashtext JSON
Version 2.7 PyPI version JSON
download
home_pagehttp://github.com/vi3k6i5/flashtext
SummaryExtract/Replaces keywords in sentences.
upload_time2018-02-16 05:24:17
maintainer
docs_urlNone
authorVikash Singh
requires_python
license
keywords
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI
coveralls test coverage
=========
FlashText
=========

.. image:: https://api.travis-ci.org/vi3k6i5/flashtext.svg?branch=master
   :target: https://travis-ci.org/vi3k6i5/flashtext
   :alt: Build Status

.. image:: https://readthedocs.org/projects/flashtext/badge/?version=latest
   :target: http://flashtext.readthedocs.io/en/latest/?badge=latest
   :alt: Documentation Status

.. image:: https://badge.fury.io/py/flashtext.svg
   :target: https://badge.fury.io/py/flashtext
   :alt: Version

.. image:: https://coveralls.io/repos/github/vi3k6i5/flashtext/badge.svg?branch=master
   :target: https://coveralls.io/github/vi3k6i5/flashtext?branch=master
   :alt: Test coverage

.. image:: https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000
   :target: https://github.com/vi3k6i5/flashtext/blob/master/LICENSE
   :alt: license


This module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_.


Installation
------------
::

    $ pip install flashtext


API doc
-------

Documentation can be found at `FlashText Read the Docs
<http://flashtext.readthedocs.io/>`_.


Usage
-----
Extract keywords
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> # keyword_processor.add_keyword(<unclean name>, <standardised name>)
    >>> keyword_processor.add_keyword('Big Apple', 'New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['New York', 'Bay Area']

Replace keywords
    >>> keyword_processor.add_keyword('New Delhi', 'NCR region')
    >>> new_sentence = keyword_processor.replace_keywords('I love Big Apple and new delhi.')
    >>> new_sentence
    >>> # 'I love New York and NCR region.'

Case Sensitive example
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor(case_sensitive=True)
    >>> keyword_processor.add_keyword('Big Apple', 'New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['Bay Area']

Span of keywords extracted
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('Big Apple', 'New York')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)
    >>> keywords_found
    >>> # [('New York', 7, 16), ('Bay Area', 21, 29)]

Get Extra information with keywords extracted
    >>> from flashtext import KeywordProcessor
    >>> kp = KeywordProcessor()
    >>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))
    >>> kp.add_keyword('Delhi', ('Location', 'Delhi'))
    >>> kp.extract_keywords('Taj Mahal is in Delhi.')
    >>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]
    >>> # NOTE: replace_keywords feature won't work with this.

No clean name for Keywords
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('Big Apple')
    >>> keyword_processor.add_keyword('Bay Area')
    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')
    >>> keywords_found
    >>> # ['Big Apple', 'Bay Area']

Add Multiple Keywords simultaneously
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    >>>     "java": ["java_2e", "java programing"],
    >>>     "product management": ["PM", "product manager"]
    >>> }
    >>> # {'clean_name': ['list of unclean names']}
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> # Or add keywords from a list:
    >>> keyword_processor.add_keywords_from_list(["java", "python"])
    >>> keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
    >>> # output ['product management', 'java']

To Remove keywords
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    >>>     "java": ["java_2e", "java programing"],
    >>>     "product management": ["PM", "product manager"]
    >>> }
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))
    >>> # output ['product management', 'java']
    >>> keyword_processor.remove_keyword('java_2e')
    >>> # you can also remove keywords from a list/ dictionary
    >>> keyword_processor.remove_keywords_from_dict({"product management": ["PM"]})
    >>> keyword_processor.remove_keywords_from_list(["java programing"])
    >>> keyword_processor.extract_keywords('I am a product manager for a java_2e platform')
    >>> # output ['product management']

To check Number of terms in KeywordProcessor
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_dict = {
    >>>     "java": ["java_2e", "java programing"],
    >>>     "product management": ["PM", "product manager"]
    >>> }
    >>> keyword_processor.add_keywords_from_dict(keyword_dict)
    >>> print(len(keyword_processor))
    >>> # output 4

To check if term is present in KeywordProcessor
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('j2ee', 'Java')
    >>> 'j2ee' in keyword_processor
    >>> # output: True
    >>> keyword_processor.get_keyword('j2ee')
    >>> # output: Java
    >>> keyword_processor['colour'] = 'color'
    >>> keyword_processor['colour']
    >>> # output: color

Get all keywords in dictionary
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('j2ee', 'Java')
    >>> keyword_processor.add_keyword('colour', 'color')
    >>> keyword_processor.get_all_keywords()
    >>> # output: {'colour': 'color', 'j2ee': 'Java'}

When detecting word boundaries, any character other than a word character (`\\w`, i.e. `[A-Za-z0-9_]`) is currently treated as a word boundary.

To set or add characters as part of word characters
    >>> from flashtext import KeywordProcessor
    >>> keyword_processor = KeywordProcessor()
    >>> keyword_processor.add_keyword('Big Apple')
    >>> print(keyword_processor.extract_keywords('I love Big Apple/Bay Area.'))
    >>> # ['Big Apple']
    >>> keyword_processor.add_non_word_boundary('/')
    >>> print(keyword_processor.extract_keywords('I love Big Apple/Bay Area.'))
    >>> # []


Test
----
::

    $ git clone https://github.com/vi3k6i5/flashtext
    $ cd flashtext
    $ pip install pytest
    $ python setup.py test


Build Docs
----------
::

    $ git clone https://github.com/vi3k6i5/flashtext
    $ cd flashtext/docs
    $ pip install sphinx
    $ make html
    $ # open _build/html/index.html in browser to view it locally


Why not Regex?
--------------

FlashText is a custom algorithm based on the `Aho-Corasick algorithm
<https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>`_ and the `Trie Dictionary
<https://en.wikipedia.org/wiki/Trie>`_.

.. image:: https://github.com/vi3k6i5/flashtext/raw/master/benchmark.png
   :target: https://twitter.com/RadimRehurek/status/904989624589803520
   :alt: Benchmark


Time taken by FlashText to find terms in comparison to Regex.

.. image:: https://thepracticaldev.s3.amazonaws.com/i/xruf50n6z1r37ti8rd89.png


Time taken by FlashText to replace terms in comparison to Regex.

.. image:: https://thepracticaldev.s3.amazonaws.com/i/k44ghwp8o712dm58debj.png

Link to code for benchmarking the `Find Feature <https://gist.github.com/vi3k6i5/604eefd92866d081cfa19f862224e4a0>`_ and `Replace Feature <https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a>`_.

The idea for this library came from the following `StackOverflow question
<https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.


Citation
----------

The original paper describing the `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_ is published on arXiv.

::

    @ARTICLE{2017arXiv171100046S,
       author = {{Singh}, V.},
        title = "{Replace or Retrieve Keywords In Documents at Scale}",
      journal = {ArXiv e-prints},
    archivePrefix = "arXiv",
       eprint = {1711.00046},
     primaryClass = "cs.DS",
     keywords = {Computer Science - Data Structures and Algorithms},
         year = 2017,
        month = oct,
       adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S},
      adsnote = {Provided by the SAO/NASA Astrophysics Data System}
    }

A related article was published on `Medium freeCodeCamp <https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f>`_.


Contribute
----------

- Issue Tracker: https://github.com/vi3k6i5/flashtext/issues
- Source Code: https://github.com/vi3k6i5/flashtext/


License
-------

The project is licensed under the MIT license.
            

Raw data

            {
    "_id": null,
    "home_page": "http://github.com/vi3k6i5/flashtext",
    "name": "flashtext",
    "maintainer": "",
    "docs_url": null,
    "requires_python": "",
    "maintainer_email": "",
    "keywords": "",
    "author": "Vikash Singh",
    "author_email": "vikash.duliajan@gmail.com",
    "download_url": "https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz",
    "platform": "any",
    "description": "=========\nFlashText\n=========\n\n.. image:: https://api.travis-ci.org/vi3k6i5/flashtext.svg?branch=master\n   :target: https://travis-ci.org/vi3k6i5/flashtext\n   :alt: Build Status\n\n.. image:: https://readthedocs.org/projects/flashtext/badge/?version=latest\n   :target: http://flashtext.readthedocs.io/en/latest/?badge=latest\n   :alt: Documentation Status\n\n.. image:: https://badge.fury.io/py/flashtext.svg\n   :target: https://badge.fury.io/py/flashtext\n   :alt: Version\n\n.. image:: https://coveralls.io/repos/github/vi3k6i5/flashtext/badge.svg?branch=master\n   :target: https://coveralls.io/github/vi3k6i5/flashtext?branch=master\n   :alt: Test coverage\n\n.. image:: https://img.shields.io/github/license/mashape/apistatus.svg?maxAge=2592000\n   :target: https://github.com/vi3k6i5/flashtext/blob/master/LICENSE\n   :alt: license\n\n\nThis module can be used to replace keywords in sentences or extract keywords from sentences. It is based on the `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_.\n\n\nInstallation\n------------\n::\n\n    $ pip install flashtext\n\n\nAPI doc\n-------\n\nDocumentation can be found at `FlashText Read the Docs\n<http://flashtext.readthedocs.io/>`_.\n\n\nUsage\n-----\nExtract keywords\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> # keyword_processor.add_keyword(<unclean name>, <standardised name>)\n    >>> keyword_processor.add_keyword('Big Apple', 'New York')\n    >>> keyword_processor.add_keyword('Bay Area')\n    >>> keywords_found = keyword_processor.extract_keywords('I love Big Apple and Bay Area.')\n    >>> keywords_found\n    >>> # ['New York', 'Bay Area']\n\nReplace keywords\n    >>> keyword_processor.add_keyword('New Delhi', 'NCR region')\n    >>> new_sentence = keyword_processor.replace_keywords('I love Big Apple and new delhi.')\n    >>> new_sentence\n    >>> # 'I love New York and NCR region.'\n\nCase Sensitive example\n    >>> from 
flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor(case_sensitive=True)\n    >>> keyword_processor.add_keyword('Big Apple', 'New York')\n    >>> keyword_processor.add_keyword('Bay Area')\n    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')\n    >>> keywords_found\n    >>> # ['Bay Area']\n\nSpan of keywords extracted\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_processor.add_keyword('Big Apple', 'New York')\n    >>> keyword_processor.add_keyword('Bay Area')\n    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.', span_info=True)\n    >>> keywords_found\n    >>> # [('New York', 7, 16), ('Bay Area', 21, 29)]\n\nGet Extra information with keywords extracted\n    >>> from flashtext import KeywordProcessor\n    >>> kp = KeywordProcessor()\n    >>> kp.add_keyword('Taj Mahal', ('Monument', 'Taj Mahal'))\n    >>> kp.add_keyword('Delhi', ('Location', 'Delhi'))\n    >>> kp.extract_keywords('Taj Mahal is in Delhi.')\n    >>> # [('Monument', 'Taj Mahal'), ('Location', 'Delhi')]\n    >>> # NOTE: replace_keywords feature won't work with this.\n\nNo clean name for Keywords\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_processor.add_keyword('Big Apple')\n    >>> keyword_processor.add_keyword('Bay Area')\n    >>> keywords_found = keyword_processor.extract_keywords('I love big Apple and Bay Area.')\n    >>> keywords_found\n    >>> # ['Big Apple', 'Bay Area']\n\nAdd Multiple Keywords simultaneously\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_dict = {\n    >>>     \"java\": [\"java_2e\", \"java programing\"],\n    >>>     \"product management\": [\"PM\", \"product manager\"]\n    >>> }\n    >>> # {'clean_name': ['list of unclean names']}\n    >>> 
keyword_processor.add_keywords_from_dict(keyword_dict)\n    >>> # Or add keywords from a list:\n    >>> keyword_processor.add_keywords_from_list([\"java\", \"python\"])\n    >>> keyword_processor.extract_keywords('I am a product manager for a java_2e platform')\n    >>> # output ['product management', 'java']\n\nTo Remove keywords\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_dict = {\n    >>>     \"java\": [\"java_2e\", \"java programing\"],\n    >>>     \"product management\": [\"PM\", \"product manager\"]\n    >>> }\n    >>> keyword_processor.add_keywords_from_dict(keyword_dict)\n    >>> print(keyword_processor.extract_keywords('I am a product manager for a java_2e platform'))\n    >>> # output ['product management', 'java']\n    >>> keyword_processor.remove_keyword('java_2e')\n    >>> # you can also remove keywords from a list/ dictionary\n    >>> keyword_processor.remove_keywords_from_dict({\"product management\": [\"PM\"]})\n    >>> keyword_processor.remove_keywords_from_list([\"java programing\"])\n    >>> keyword_processor.extract_keywords('I am a product manager for a java_2e platform')\n    >>> # output ['product management']\n\nTo check Number of terms in KeywordProcessor\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_dict = {\n    >>>     \"java\": [\"java_2e\", \"java programing\"],\n    >>>     \"product management\": [\"PM\", \"product manager\"]\n    >>> }\n    >>> keyword_processor.add_keywords_from_dict(keyword_dict)\n    >>> print(len(keyword_processor))\n    >>> # output 4\n\nTo check if term is present in KeywordProcessor\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_processor.add_keyword('j2ee', 'Java')\n    >>> 'j2ee' in keyword_processor\n    >>> # output: True\n    >>> keyword_processor.get_keyword('j2ee')\n    >>> # output: Java\n    >>> 
keyword_processor['colour'] = 'color'\n    >>> keyword_processor['colour']\n    >>> # output: color\n\nGet all keywords in dictionary\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_processor.add_keyword('j2ee', 'Java')\n    >>> keyword_processor.add_keyword('colour', 'color')\n    >>> keyword_processor.get_all_keywords()\n    >>> # output: {'colour': 'color', 'j2ee': 'Java'}\n\nFor detecting Word Boundary currently any character other than this `\\\\w` `[A-Za-z0-9_]` is considered a word boundary.\n\nTo set or add characters as part of word characters\n    >>> from flashtext import KeywordProcessor\n    >>> keyword_processor = KeywordProcessor()\n    >>> keyword_processor.add_keyword('Big Apple')\n    >>> print(keyword_processor.extract_keywords('I love Big Apple/Bay Area.'))\n    >>> # ['Big Apple']\n    >>> keyword_processor.add_non_word_boundary('/')\n    >>> print(keyword_processor.extract_keywords('I love Big Apple/Bay Area.'))\n    >>> # []\n\n\nTest\n----\n::\n\n    $ git clone https://github.com/vi3k6i5/flashtext\n    $ cd flashtext\n    $ pip install pytest\n    $ python setup.py test\n\n\nBuild Docs\n----------\n::\n\n    $ git clone https://github.com/vi3k6i5/flashtext\n    $ cd flashtext/docs\n    $ pip install sphinx\n    $ make html\n    $ # open _build/html/index.html in browser to view it locally\n\n\nWhy not Regex?\n--------------\n\nIt's a custom algorithm based on `Aho-Corasick algorithm\n<https://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_algorithm>`_ and `Trie Dictionary\n<https://en.wikipedia.org/wiki/Trie Dictionary>`_.\n\n.. image:: https://github.com/vi3k6i5/flashtext/raw/master/benchmark.png\n   :target: https://twitter.com/RadimRehurek/status/904989624589803520\n   :alt: Benchmark\n\n\nTime taken by FlashText to find terms in comparison to Regex.\n\n.. 
image:: https://thepracticaldev.s3.amazonaws.com/i/xruf50n6z1r37ti8rd89.png\n\n\nTime taken by FlashText to replace terms in comparison to Regex.\n\n.. image:: https://thepracticaldev.s3.amazonaws.com/i/k44ghwp8o712dm58debj.png\n\nLink to code for benchmarking the `Find Feature <https://gist.github.com/vi3k6i5/604eefd92866d081cfa19f862224e4a0>`_ and `Replace Feature <https://gist.github.com/vi3k6i5/dc3335ee46ab9f650b19885e8ade6c7a>`_.\n\nThe idea for this library came from the following `StackOverflow question\n<https://stackoverflow.com/questions/44178449/regex-replace-is-taking-time-for-millions-of-documents-how-to-make-it-faster>`_.\n\n\nCitation\n----------\n\nThe original paper published on `FlashText algorithm <https://arxiv.org/abs/1711.00046>`_.\n\n::\n\n    @ARTICLE{2017arXiv171100046S,\n       author = {{Singh}, V.},\n        title = \"{Replace or Retrieve Keywords In Documents at Scale}\",\n      journal = {ArXiv e-prints},\n    archivePrefix = \"arXiv\",\n       eprint = {1711.00046},\n     primaryClass = \"cs.DS\",\n     keywords = {Computer Science - Data Structures and Algorithms},\n         year = 2017,\n        month = oct,\n       adsurl = {http://adsabs.harvard.edu/abs/2017arXiv171100046S},\n      adsnote = {Provided by the SAO/NASA Astrophysics Data System}\n    }\n\nThe article published on `Medium freeCodeCamp <https://medium.freecodecamp.org/regex-was-taking-5-days-flashtext-does-it-in-15-minutes-55f04411025f>`_.\n\n\nContribute\n----------\n\n- Issue Tracker: https://github.com/vi3k6i5/flashtext/issues\n- Source Code: https://github.com/vi3k6i5/flashtext/\n\n\nLicense\n-------\n\nThe project is licensed under the MIT license.",
    "bugtrack_url": null,
    "license": "",
    "summary": "Extract/Replaces keywords in sentences.",
    "version": "2.7",
    "split_keywords": [],
    "urls": [
        {
            "comment_text": "",
            "digests": {
                "md5": "2a8b58110dffa7ddbc68751542c9e20e",
                "sha256": "a1be2b93e09d4f0deee4aad72b91a7127b61fb8b8034ca9a9c78ea745d8b05cf"
            },
            "downloads": -1,
            "filename": "flashtext-2.7.tar.gz",
            "has_sig": false,
            "md5_digest": "2a8b58110dffa7ddbc68751542c9e20e",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 14536,
            "upload_time": "2018-02-16T05:24:17",
            "upload_time_iso_8601": "2018-02-16T05:24:17.232890Z",
            "url": "https://files.pythonhosted.org/packages/81/d8/2cd0656eae456d615c2f1efbcae8dfca2cb871a31f34ba8925aba47d5e09/flashtext-2.7.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2018-02-16 05:24:17",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "github_user": "vi3k6i5",
    "github_project": "flashtext",
    "travis_ci": true,
    "coveralls": true,
    "github_actions": false,
    "lcname": "flashtext"
}
        
Elapsed time: 0.03260s