smart-chunker

Name	smart-chunker JSON
Version	0.0.2 JSON
	download
home_page	https://github.com/bond005/smart_chunker
Summary	Smart-Chunker is a semantic chunker to prepare a long document for RAG
upload_time	2025-07-11 11:28:17
maintainer	None
docs_url	None
author	Ivan Bondarenko
requires_python	None
license	Apache License Version 2.0
keywords	smart-chunker rag chunker cross-encoder encoder reranker
VCS
bugtrack_url
requirements	nltk nltk-punkt razdel sentencepiece torch transformers
Travis-CI	No Travis.
coveralls test coverage	No coveralls.

            
Smart-Chunker
===============

This smart chunker is a semantic chunker that prepares a
long document for retrieval-augmented generation (RAG).

Unlike a conventional chunker, it does not split the text into
fixed-size groups of N tokens. Instead, it uses a cross-encoder
to compute the similarity between neighboring sentences and
splits the text at the most significant semantic transitions,
i.e. at the minima of the above-mentioned similarity
function.

BAAI/bge-reranker-v2-m3, or any other model that supports the
AutoModelForSequenceClassification interface, should be used
as the cross-encoder.

The smart chunker supports Russian and English.

Raw data

            {
    "_id": null,
    "home_page": "https://github.com/bond005/smart_chunker",
    "name": "smart-chunker",
    "maintainer": null,
    "docs_url": null,
    "requires_python": null,
    "maintainer_email": null,
    "keywords": "smart-chunker, rag, chunker, cross-encoder, encoder, reranker",
    "author": "Ivan Bondarenko",
    "author_email": "bond005@yandex.ru",
    "download_url": "https://files.pythonhosted.org/packages/e2/79/b9b49bc5b87c9209e38faa16ed6582ce82464d812fa85c0e8b9f22d491b3/smart_chunker-0.0.2.tar.gz",
    "platform": null,
    "description": "\nSmart-Chunker\n===============\n\nThis smart chunker is a semantic chunker to prepare a\nlong document for retrieval augmented generation (RAG).\n\nUnlike a usual chunker, it does not split the text into\nidentical groups of N tokens. Instead, it uses a cross-encoder\nto calculate the similarity function between neighboring\nsentences and divides the text based on the most significant\nboundaries of semantic transitions, i.e. minima in the\nabove-mentioned similarity function.\n\nThe BAAI/bge-reranker-v2-m3, or any other model that supports the\nAutoModelForSequenceClassification interface, should be used\nas a cross encoder.\n\nThe smart chunker supports Russian and English.\n",
    "bugtrack_url": null,
    "license": "Apache License Version 2.0",
    "summary": "Smart-Chunker is a semantic chunker to prepare a long document for RAG",
    "version": "0.0.2",
    "project_urls": {
        "Homepage": "https://github.com/bond005/smart_chunker"
    },
    "split_keywords": [
        "smart-chunker",
        " rag",
        " chunker",
        " cross-encoder",
        " encoder",
        " reranker"
    ],
    "urls": [
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "a0c04a03ea4ebfaf79742e486a9a9d71e29e683392fa78b2be802106f46f57ab",
                "md5": "63033e24b94fb97ef6c03a4ed7ba13dc",
                "sha256": "c85df749f8e8025d924738f65fddb72cc67876eaa152c2169ab97d0503a3af5e"
            },
            "downloads": -1,
            "filename": "smart_chunker-0.0.2-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "63033e24b94fb97ef6c03a4ed7ba13dc",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": null,
            "size": 9454,
            "upload_time": "2025-07-11T11:28:06",
            "upload_time_iso_8601": "2025-07-11T11:28:06.667318Z",
            "url": "https://files.pythonhosted.org/packages/a0/c0/4a03ea4ebfaf79742e486a9a9d71e29e683392fa78b2be802106f46f57ab/smart_chunker-0.0.2-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "e279b9b49bc5b87c9209e38faa16ed6582ce82464d812fa85c0e8b9f22d491b3",
                "md5": "50dca8626863ae3ca5ca33ad43050d14",
                "sha256": "2bc8367f0556be7c1b45ea678d91d58243462bf7e7d196abedd8c57f7be40243"
            },
            "downloads": -1,
            "filename": "smart_chunker-0.0.2.tar.gz",
            "has_sig": false,
            "md5_digest": "50dca8626863ae3ca5ca33ad43050d14",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": null,
            "size": 3893953,
            "upload_time": "2025-07-11T11:28:17",
            "upload_time_iso_8601": "2025-07-11T11:28:17.908974Z",
            "url": "https://files.pythonhosted.org/packages/e2/79/b9b49bc5b87c9209e38faa16ed6582ce82464d812fa85c0e8b9f22d491b3/smart_chunker-0.0.2.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2025-07-11 11:28:17",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "bond005",
    "github_project": "smart_chunker",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "requirements": [
        {
            "name": "nltk",
            "specs": []
        },
        {
            "name": "nltk-punkt",
            "specs": []
        },
        {
            "name": "razdel",
            "specs": [
                [
                    "==",
                    "0.5.0"
                ]
            ]
        },
        {
            "name": "sentencepiece",
            "specs": []
        },
        {
            "name": "torch",
            "specs": [
                [
                    ">=",
                    "2.0.1"
                ]
            ]
        },
        {
            "name": "transformers",
            "specs": [
                [
                    ">=",
                    "4.38.1"
                ]
            ]
        }
    ],
    "lcname": "smart-chunker"
}

Ivan Bondarenko