nanofts


Name: nanofts
Version: 0.1.0 (PyPI)
download
Home page: None
Summary: A lightweight full-text search library for Python
Upload time: 2025-08-02 13:03:27
Maintainer: None
Docs URL: None
Author: None
Requires Python: >=3.9
License: Apache-2.0
Keywords: full-text-search, indexing, search-engine, chinese-text, fuzzy-search
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
            # NanoFTS

A lightweight full-text search engine implementation in Python, featuring efficient indexing and searching capabilities for both English and Chinese text.

## Features

- Lightweight and efficient full-text search implementation
- Pure Python with minimal dependencies (only requires `pyroaring` and `msgpack`)
- Support for both English and Chinese text
- Memory-efficient disk-based index storage with sharding
- Incremental indexing and real-time updates
- Case-insensitive search
- Phrase matching support
- **🔍 Fuzzy Search Support**: Intelligent fuzzy matching with configurable similarity thresholds
- **📝 Document Management**: Full CRUD operations (Create, Read, Update, Delete)
- Built-in LRU caching for frequently accessed terms
- Data import support from popular formats:
  - Pandas DataFrame
  - Polars DataFrame
  - Apache Arrow Table
  - Parquet files
  - CSV files

## Installation

```bash
# Basic installation
pip install nanofts

# With pandas support
pip install nanofts[pandas]

# With polars support
pip install nanofts[polars]

# With Apache Arrow/Parquet support
pip install nanofts[pyarrow]

# Install all optional dependencies
pip install nanofts[all]

# Development dependencies (for contributors)
pip install nanofts[dev]
```

## Usage

### Quick Start
```python
from nanofts import FullTextSearch

# Initialize with fuzzy search support
fts = FullTextSearch(index_dir="./index", fuzzy_threshold=0.6)

# Add documents
fts.add_document(1, {"title": "Python教程", "content": "学习Python编程"})
fts.add_document(2, {"title": "数据分析", "content": "使用pandas进行数据处理"})
fts.flush()

# Search with typo handling
results = fts.fuzzy_search("Pytho教成")  # Finds "Python教程" despite typos
print(f"Found {len(results)} documents")

# Update and delete documents
fts.update_document(1, {"title": "高级Python教程"})
fts.remove_document(2)
```

### Basic Example
```python
from nanofts import FullTextSearch

# Create a new search instance with disk storage
fts = FullTextSearch(index_dir="./index")

# Add single document
fts.add_document(1, {
    "title": "Hello World",
    "content": "Python full-text search engine"
})

# Add multiple documents at once
docs = [
    {"title": "全文搜索", "content": "支持中文搜索功能"},
    {"title": "Mixed Text", "content": "Support both English and 中文"}
]
fts.add_document([2, 3], docs)

# Don't forget to flush after adding documents
fts.flush()

# Search for documents
results = fts.search("python search")  # Case-insensitive search
print(results)  # Returns list of matching document IDs

# Chinese text search
results = fts.search("全文搜索")
print(results)
```

### Fuzzy Search
```python
# Enable fuzzy search for typos and similar words
fts = FullTextSearch(
    index_dir="./index",
    fuzzy_threshold=0.6,      # Similarity threshold (0.0-1.0)
    fuzzy_max_distance=2      # Maximum edit distance
)

# Add some documents
fts.add_document(1, {"title": "苹果手机", "content": "最新的iPhone产品"})
fts.add_document(2, {"title": "编程教程", "content": "Python开发指南"})
fts.flush()

# Exact search
exact_results = fts.search("苹果", enable_fuzzy=False)
print(f"Exact search: {len(exact_results)} results")

# Fuzzy search for typos (苹檎 instead of 苹果)
fuzzy_results = fts.search("苹檎", enable_fuzzy=True, min_results=1)
print(f"Fuzzy search: {len(fuzzy_results)} results")

# Convenient fuzzy search method
results = fts.fuzzy_search("编成")  # 编成 -> 编程
print(f"Fuzzy search results: {results}")

# Configure fuzzy search parameters
fts.set_fuzzy_config(fuzzy_threshold=0.8, fuzzy_max_distance=1)
config = fts.get_fuzzy_config()
print(f"Current config: {config}")
```

### Document Management (CRUD Operations)
```python
# Create: Add documents (already shown above)
fts.add_document(1, {"title": "Document 1", "content": "Content 1"})

# Read: Search documents (already shown above)
results = fts.search("Document")

# Update: Modify existing documents
fts.update_document(1, {"title": "Updated Document", "content": "Updated Content"})

# Batch update multiple documents
fts.update_document([1, 2], [
    {"title": "New Title 1", "content": "New Content 1"},
    {"title": "New Title 2", "content": "New Content 2"}
])

# Delete: Remove documents
fts.remove_document(1)  # Remove single document

# Batch delete multiple documents
fts.remove_document([2, 3, 4])  # Remove multiple documents
```

### Data Import from Different Sources
```python
# Import from pandas DataFrame
import pandas as pd

df = pd.DataFrame({
    'id': [1, 2, 3],
    'title': ['Hello World', '全文搜索', 'Test Document'],
    'content': ['This is a test', '支持多语言', 'Another test']
})

fts = FullTextSearch(index_dir="./index")
fts.from_pandas(df, id_column='id')

# Import from Polars DataFrame
import polars as pl
df = pl.DataFrame(...)
fts.from_polars(df, id_column='id')

# Import from Arrow Table
import pyarrow as pa
table = pa.Table.from_pandas(df)
fts.from_arrow(table, id_column='id')

# Import from Parquet file
fts.from_parquet("documents.parquet", id_column='id')

# Import from CSV file
fts.from_csv("documents.csv", id_column='id')
```

### Advanced Configuration
```python
fts = FullTextSearch(
    index_dir="./index",           # Index storage directory
    max_chinese_length=4,          # Maximum length for Chinese substrings
    num_workers=4,                 # Number of parallel workers
    shard_size=100_000,           # Documents per shard
    min_term_length=2,            # Minimum term length to index
    auto_save=True,               # Auto-save to disk
    batch_size=1000,              # Batch processing size
    buffer_size=10000,            # Memory buffer size
    drop_if_exists=False,         # Whether to drop existing index
    fuzzy_threshold=0.4,          # Fuzzy search similarity threshold (0.0-1.0)
    fuzzy_max_distance=2          # Maximum edit distance for fuzzy search
)
```

## Implementation Details

- Uses `pyroaring` for efficient bitmap operations
- Implements sharding for large-scale indexes
- LRU caching for frequently accessed terms
- Parallel processing for batch indexing
- Incremental updates with memory buffer
- Disk-based storage with msgpack serialization
- Support for both exact and phrase matching
- Efficient Chinese text substring indexing
- **Fuzzy Search Features**:
  - Zero I/O overhead: completely in-memory fuzzy matching
  - Intelligent activation: automatically enabled when exact results are insufficient
  - Configurable similarity thresholds and edit distance
  - Support for both Chinese and English fuzzy matching
  - Built-in caching for repeated fuzzy queries
- **Document Management**:
  - Full CRUD operations with atomic updates
  - Batch operations for high-performance updates
  - Incremental saving for modified documents

## License

This project is licensed under the Apache License 2.0 - see the LICENSE file for details.

## Contributing

Contributions are welcome! Please feel free to submit a Pull Request.

            

Raw data

            {
    "_id": null,
    "home_page": null,
    "name": "nanofts",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.9",
    "maintainer_email": null,
    "keywords": "full-text-search, indexing, search-engine, chinese-text, fuzzy-search",
    "author": null,
    "author_email": "Birch Kwok <birchkwok@gmail.com>",
    "download_url": "https://files.pythonhosted.org/packages/21/86/0ca49515c281946267dcfe67e7eb4e885e650807c66929cff1448ab787c8/nanofts-0.1.0.tar.gz",
    "platform": null,
    "description": "# NanoFTS\n\nA lightweight full-text search engine implementation in Python, featuring efficient indexing and searching capabilities for both English and Chinese text.\n\n## Features\n\n- Lightweight and efficient full-text search implementation\n- Pure Python with minimal dependencies (only requires `pyroaring` and `msgpack`)\n- Support for both English and Chinese text\n- Memory-efficient disk-based index storage with sharding\n- Incremental indexing and real-time updates\n- Case-insensitive search\n- Phrase matching support\n- **\ud83d\udd0d Fuzzy Search Support**: Intelligent fuzzy matching with configurable similarity thresholds\n- **\ud83d\udcdd Document Management**: Full CRUD operations (Create, Read, Update, Delete)\n- Built-in LRU caching for frequently accessed terms\n- Data import support from popular formats:\n  - Pandas DataFrame\n  - Polars DataFrame\n  - Apache Arrow Table\n  - Parquet files\n  - CSV files\n\n## Installation\n\n```bash\n# Basic installation\npip install nanofts\n\n# With pandas support\npip install nanofts[pandas]\n\n# With polars support\npip install nanofts[polars]\n\n# With Apache Arrow/Parquet support\npip install nanofts[pyarrow]\n\n# Install all optional dependencies\npip install nanofts[all]\n\n# Development dependencies (for contributors)\npip install nanofts[dev]\n```\n\n## Usage\n\n### Quick Start\n```python\nfrom nanofts import FullTextSearch\n\n# Initialize with fuzzy search support\nfts = FullTextSearch(index_dir=\"./index\", fuzzy_threshold=0.6)\n\n# Add documents\nfts.add_document(1, {\"title\": \"Python\u6559\u7a0b\", \"content\": \"\u5b66\u4e60Python\u7f16\u7a0b\"})\nfts.add_document(2, {\"title\": \"\u6570\u636e\u5206\u6790\", \"content\": \"\u4f7f\u7528pandas\u8fdb\u884c\u6570\u636e\u5904\u7406\"})\nfts.flush()\n\n# Search with typo handling\nresults = fts.fuzzy_search(\"Pytho\u6559\u6210\")  # Finds \"Python\u6559\u7a0b\" despite typos\nprint(f\"Found {len(results)} documents\")\n\n# Update 
and delete documents\nfts.update_document(1, {\"title\": \"\u9ad8\u7ea7Python\u6559\u7a0b\"})\nfts.remove_document(2)\n```\n\n### Basic Example\n```python\nfrom nanofts import FullTextSearch\n\n# Create a new search instance with disk storage\nfts = FullTextSearch(index_dir=\"./index\")\n\n# Add single document\nfts.add_document(1, {\n    \"title\": \"Hello World\",\n    \"content\": \"Python full-text search engine\"\n})\n\n# Add multiple documents at once\ndocs = [\n    {\"title\": \"\u5168\u6587\u641c\u7d22\", \"content\": \"\u652f\u6301\u4e2d\u6587\u641c\u7d22\u529f\u80fd\"},\n    {\"title\": \"Mixed Text\", \"content\": \"Support both English and \u4e2d\u6587\"}\n]\nfts.add_document([2, 3], docs)\n\n# Don't forget to flush after adding documents\nfts.flush()\n\n# Search for documents\nresults = fts.search(\"python search\")  # Case-insensitive search\nprint(results)  # Returns list of matching document IDs\n\n# Chinese text search\nresults = fts.search(\"\u5168\u6587\u641c\u7d22\")\nprint(results)\n```\n\n### Fuzzy Search\n```python\n# Enable fuzzy search for typos and similar words\nfts = FullTextSearch(\n    index_dir=\"./index\",\n    fuzzy_threshold=0.6,      # Similarity threshold (0.0-1.0)\n    fuzzy_max_distance=2      # Maximum edit distance\n)\n\n# Add some documents\nfts.add_document(1, {\"title\": \"\u82f9\u679c\u624b\u673a\", \"content\": \"\u6700\u65b0\u7684iPhone\u4ea7\u54c1\"})\nfts.add_document(2, {\"title\": \"\u7f16\u7a0b\u6559\u7a0b\", \"content\": \"Python\u5f00\u53d1\u6307\u5357\"})\nfts.flush()\n\n# Exact search\nexact_results = fts.search(\"\u82f9\u679c\", enable_fuzzy=False)\nprint(f\"Exact search: {len(exact_results)} results\")\n\n# Fuzzy search for typos (\u82f9\u6a8e instead of \u82f9\u679c)\nfuzzy_results = fts.search(\"\u82f9\u6a8e\", enable_fuzzy=True, min_results=1)\nprint(f\"Fuzzy search: {len(fuzzy_results)} results\")\n\n# Convenient fuzzy search method\nresults = fts.fuzzy_search(\"\u7f16\u6210\")  # \u7f16\u6210 -> 
\u7f16\u7a0b\nprint(f\"Fuzzy search results: {results}\")\n\n# Configure fuzzy search parameters\nfts.set_fuzzy_config(fuzzy_threshold=0.8, fuzzy_max_distance=1)\nconfig = fts.get_fuzzy_config()\nprint(f\"Current config: {config}\")\n```\n\n### Document Management (CRUD Operations)\n```python\n# Create: Add documents (already shown above)\nfts.add_document(1, {\"title\": \"Document 1\", \"content\": \"Content 1\"})\n\n# Read: Search documents (already shown above)\nresults = fts.search(\"Document\")\n\n# Update: Modify existing documents\nfts.update_document(1, {\"title\": \"Updated Document\", \"content\": \"Updated Content\"})\n\n# Batch update multiple documents\nfts.update_document([1, 2], [\n    {\"title\": \"New Title 1\", \"content\": \"New Content 1\"},\n    {\"title\": \"New Title 2\", \"content\": \"New Content 2\"}\n])\n\n# Delete: Remove documents\nfts.remove_document(1)  # Remove single document\n\n# Batch delete multiple documents\nfts.remove_document([2, 3, 4])  # Remove multiple documents\n```\n\n### Data Import from Different Sources\n```python\n# Import from pandas DataFrame\nimport pandas as pd\n\ndf = pd.DataFrame({\n    'id': [1, 2, 3],\n    'title': ['Hello World', '\u5168\u6587\u641c\u7d22', 'Test Document'],\n    'content': ['This is a test', '\u652f\u6301\u591a\u8bed\u8a00', 'Another test']\n})\n\nfts = FullTextSearch(index_dir=\"./index\")\nfts.from_pandas(df, id_column='id')\n\n# Import from Polars DataFrame\nimport polars as pl\ndf = pl.DataFrame(...)\nfts.from_polars(df, id_column='id')\n\n# Import from Arrow Table\nimport pyarrow as pa\ntable = pa.Table.from_pandas(df)\nfts.from_arrow(table, id_column='id')\n\n# Import from Parquet file\nfts.from_parquet(\"documents.parquet\", id_column='id')\n\n# Import from CSV file\nfts.from_csv(\"documents.csv\", id_column='id')\n```\n\n### Advanced Configuration\n```python\nfts = FullTextSearch(\n    index_dir=\"./index\",           # Index storage directory\n    max_chinese_length=4,          # 
Maximum length for Chinese substrings\n    num_workers=4,                 # Number of parallel workers\n    shard_size=100_000,           # Documents per shard\n    min_term_length=2,            # Minimum term length to index\n    auto_save=True,               # Auto-save to disk\n    batch_size=1000,              # Batch processing size\n    buffer_size=10000,            # Memory buffer size\n    drop_if_exists=False,         # Whether to drop existing index\n    fuzzy_threshold=0.4,          # Fuzzy search similarity threshold (0.0-1.0)\n    fuzzy_max_distance=2          # Maximum edit distance for fuzzy search\n)\n```\n\n## Implementation Details\n\n- Uses `pyroaring` for efficient bitmap operations\n- Implements sharding for large-scale indexes\n- LRU caching for frequently accessed terms\n- Parallel processing for batch indexing\n- Incremental updates with memory buffer\n- Disk-based storage with msgpack serialization\n- Support for both exact and phrase matching\n- Efficient Chinese text substring indexing\n- **Fuzzy Search Features**:\n  - Zero I/O overhead: completely in-memory fuzzy matching\n  - Intelligent activation: automatically enabled when exact results are insufficient\n  - Configurable similarity thresholds and edit distance\n  - Support for both Chinese and English fuzzy matching\n  - Built-in caching for repeated fuzzy queries\n- **Document Management**:\n  - Full CRUD operations with atomic updates\n  - Batch operations for high-performance updates\n  - Incremental saving for modified documents\n\n## License\n\nThis project is licensed under the Apache License 2.0 - see the LICENSE file for details.\n\n## Contributing\n\nContributions are welcome! Please feel free to submit a Pull Request.\n",
    "bugtrack_url": null,
    "license": "Apache-2.0",
    "summary": "A lightweight full-text search library for Python",
    "version": "0.1.0",
    "project_urls": {
        "Documentation": "https://github.com/BirchKwok/NanoFTS/blob/main/README.md",
        "Homepage": "https://github.com/BirchKwok/NanoFTS",
        "Issues": "https://github.com/BirchKwok/NanoFTS/issues",
        "Repository": "https://github.com/BirchKwok/NanoFTS"
    },
    "split_keywords": [
        "full-text-search",
        " indexing",
        " search-engine",
        " chinese-text",
        " fuzzy-search"
    ],
    "urls": [
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "d8ed33503e8fb30550c6d9ae65642eae2660fbc8436f6cb355a6544a5a64f0c1",
                "md5": "7fb7ea853f389e25babad44ad1d5f996",
                "sha256": "6208136d66c59135777e52e2b52ee455e526520380ada3e1e31099d830dd7951"
            },
            "downloads": -1,
            "filename": "nanofts-0.1.0-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "7fb7ea853f389e25babad44ad1d5f996",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.9",
            "size": 26981,
            "upload_time": "2025-08-02T13:03:25",
            "upload_time_iso_8601": "2025-08-02T13:03:25.474742Z",
            "url": "https://files.pythonhosted.org/packages/d8/ed/33503e8fb30550c6d9ae65642eae2660fbc8436f6cb355a6544a5a64f0c1/nanofts-0.1.0-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "21860ca49515c281946267dcfe67e7eb4e885e650807c66929cff1448ab787c8",
                "md5": "7c0d4f828e025a385548483d4a44b03f",
                "sha256": "acba51e09985ae98a4d50e6e366657c98d1e34f2f219edc0bebd543809ca1058"
            },
            "downloads": -1,
            "filename": "nanofts-0.1.0.tar.gz",
            "has_sig": false,
            "md5_digest": "7c0d4f828e025a385548483d4a44b03f",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.9",
            "size": 30635,
            "upload_time": "2025-08-02T13:03:27",
            "upload_time_iso_8601": "2025-08-02T13:03:27.284458Z",
            "url": "https://files.pythonhosted.org/packages/21/86/0ca49515c281946267dcfe67e7eb4e885e650807c66929cff1448ab787c8/nanofts-0.1.0.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2025-08-02 13:03:27",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "BirchKwok",
    "github_project": "NanoFTS",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": true,
    "lcname": "nanofts"
}
        
Elapsed time: 1.20090s