Smart-Chunker
===============
Smart-Chunker is a semantic chunker that prepares
long documents for retrieval-augmented generation (RAG).
Unlike a conventional chunker, it does not split the text into
fixed-size groups of N tokens. Instead, it uses a cross-encoder
to compute the similarity between neighboring sentences and
splits the text at the most significant semantic transitions,
i.e. at the minima of this similarity function.
BAAI/bge-reranker-v2-m3, or any other model that supports the
AutoModelForSequenceClassification interface, should be used
as the cross-encoder.
The smart chunker supports Russian and English.
Raw data
{
"_id": null,
"home_page": "https://github.com/bond005/smart_chunker",
"name": "smart-chunker",
"maintainer": null,
"docs_url": null,
"requires_python": null,
"maintainer_email": null,
"keywords": "smart-chunker, rag, chunker, cross-encoder, encoder, reranker",
"author": "Ivan Bondarenko",
"author_email": "bond005@yandex.ru",
"download_url": "https://files.pythonhosted.org/packages/e2/79/b9b49bc5b87c9209e38faa16ed6582ce82464d812fa85c0e8b9f22d491b3/smart_chunker-0.0.2.tar.gz",
"platform": null,
"description": "\nSmart-Chunker\n===============\n\nThis smart chunker is a semantic chunker to prepare a\nlong document for retrieval augmented generation (RAG).\n\nUnlike a usual chunker, it does not split the text into\nidentical groups of N tokens. Instead, it uses a cross-encoder\nto calculate the similarity function between neighboring\nsentences and divides the text based on the most significant\nboundaries of semantic transitions, i.e. minima in the\nabove-mentioned similarity function.\n\nThe BAAI/bge-reranker-v2-m3, or any other model that supports the\nAutoModelForSequenceClassification interface, should be used\nas a cross encoder.\n\nThe smart chunker supports Russian and English.\n",
"bugtrack_url": null,
"license": "Apache License Version 2.0",
"summary": "Smart-Chunker is a semantic chunker to prepare a long document for RAG",
"version": "0.0.2",
"project_urls": {
"Homepage": "https://github.com/bond005/smart_chunker"
},
"split_keywords": [
"smart-chunker",
" rag",
" chunker",
" cross-encoder",
" encoder",
" reranker"
],
"urls": [
{
"comment_text": null,
"digests": {
"blake2b_256": "a0c04a03ea4ebfaf79742e486a9a9d71e29e683392fa78b2be802106f46f57ab",
"md5": "63033e24b94fb97ef6c03a4ed7ba13dc",
"sha256": "c85df749f8e8025d924738f65fddb72cc67876eaa152c2169ab97d0503a3af5e"
},
"downloads": -1,
"filename": "smart_chunker-0.0.2-py3-none-any.whl",
"has_sig": false,
"md5_digest": "63033e24b94fb97ef6c03a4ed7ba13dc",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": null,
"size": 9454,
"upload_time": "2025-07-11T11:28:06",
"upload_time_iso_8601": "2025-07-11T11:28:06.667318Z",
"url": "https://files.pythonhosted.org/packages/a0/c0/4a03ea4ebfaf79742e486a9a9d71e29e683392fa78b2be802106f46f57ab/smart_chunker-0.0.2-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": null,
"digests": {
"blake2b_256": "e279b9b49bc5b87c9209e38faa16ed6582ce82464d812fa85c0e8b9f22d491b3",
"md5": "50dca8626863ae3ca5ca33ad43050d14",
"sha256": "2bc8367f0556be7c1b45ea678d91d58243462bf7e7d196abedd8c57f7be40243"
},
"downloads": -1,
"filename": "smart_chunker-0.0.2.tar.gz",
"has_sig": false,
"md5_digest": "50dca8626863ae3ca5ca33ad43050d14",
"packagetype": "sdist",
"python_version": "source",
"requires_python": null,
"size": 3893953,
"upload_time": "2025-07-11T11:28:17",
"upload_time_iso_8601": "2025-07-11T11:28:17.908974Z",
"url": "https://files.pythonhosted.org/packages/e2/79/b9b49bc5b87c9209e38faa16ed6582ce82464d812fa85c0e8b9f22d491b3/smart_chunker-0.0.2.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-07-11 11:28:17",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "bond005",
"github_project": "smart_chunker",
"travis_ci": false,
"coveralls": false,
"github_actions": false,
"requirements": [
{
"name": "nltk",
"specs": []
},
{
"name": "nltk-punkt",
"specs": []
},
{
"name": "razdel",
"specs": [
[
"==",
"0.5.0"
]
]
},
{
"name": "sentencepiece",
"specs": []
},
{
"name": "torch",
"specs": [
[
">=",
"2.0.1"
]
]
},
{
"name": "transformers",
"specs": [
[
">=",
"4.38.1"
]
]
}
],
"lcname": "smart-chunker"
}