textlasso


Nametextlasso JSON
Version 0.1.2 PyPI version JSON
download
home_pageNone
SummarySimple package for grabbing data from raw text.
upload_time2025-07-26 14:44:31
maintainerNone
docs_urlNone
authorNone
requires_python>=3.9
licenseNone
keywords llm text crawl extract text-cleaning
VCS
bugtrack_url
requirements No requirements were recorded.
Travis-CI No Travis.
coveralls test coverage No coveralls.
# TextLasso 🤠

[![PyPI version](https://badge.fury.io/py/textlasso.svg)](https://badge.fury.io/py/textlasso)
[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)

**TextLasso** is a simple Python library for extracting structured data from raw text, with a special focus on processing LLM (Large Language Model) responses. Whether you're parsing JSON buried in markdown, extracting data from XML, or need to generate structured prompts for AI models, TextLasso has you covered.

## ✨ Key Features

- 🎯 **Smart Text Extraction**: Extract structured data from messy text with multiple fallback strategies
- 🧹 **LLM Response Cleaning**: Automatically clean code blocks, markdown artifacts, and formatting
- 🏗️ **Dataclass Integration**: Convert raw text directly to Python dataclasses with type validation
- 🤖 **AI Prompt Generation**: Generate structured prompts with schema validation and examples
- 📊 **Multiple Formats**: Supports JSON and XML, and is extensible to other formats
- 🔧 **Flexible Configuration**: Configurable error handling, logging, and validation modes
- 🎨 **Decorator Support**: Enhance existing functions with structured output capabilities

## 🚀 Quick Start

### Installation

```bash
pip install textlasso
```

### Basic Usage

```python
from dataclasses import dataclass
from typing import List, Optional
from textlasso import extract

@dataclass
class Person:
    name: str
    age: int
    email: Optional[str] = None
    skills: List[str] = None

# Extract from messy LLM response
llm_response = """
Here's the person data you requested:

\```json
{
    "name": "Alice Johnson",
    "age": 30,
    "email": "alice@company.com", 
    "skills": ["Python", "Machine Learning", "Data Science"]
}
\```

Hope this helps!
"""

person = extract(llm_response, Person, extract_strategy='json')
print(f"Extracted: {person.name}, {person.age} years old")
print(person)
# Extracted: Alice Johnson, 30 years old
# Person(name='Alice Johnson', age=30, email='alice@company.com', skills=['Python', 'Machine Learning', 'Data Science'])
```

## 📚 Comprehensive Examples

### 1. Basic Text Extraction

#### JSON Extraction with Fallback Strategies

```python
from dataclasses import dataclass
from typing import List, Optional
from textlasso import extract

@dataclass
class Product:
    name: str
    price: float
    category: str
    in_stock: bool
    tags: Optional[List[str]] = None

# Works with clean JSON
clean_json = '{"name": "Laptop", "price": 999.99, "category": "Electronics", "in_stock": true}'

# Works with markdown-wrapped JSON
markdown_json = """
Here's your product data:
```json
{
    "name": "Wireless Headphones",
    "price": 199.99,
    "category": "Electronics", 
    "in_stock": false,
    "tags": ["wireless", "bluetooth", "noise-canceling"]
}
\```
"""

# Works with messy responses
messy_response = """
Let me extract that product information for you...

The product details are: {"name": "Smart Watch", "price": 299.99, "category": "Wearables", "in_stock": true}

Is this what you were looking for?
"""

# All of these work automatically
products = [
    extract(clean_json, Product, extract_strategy='json'),
    extract(markdown_json, Product, extract_strategy='json'), 
    extract(messy_response, Product, extract_strategy='json')
]

for product in products:
    print(f"{product.name}: ${product.price} ({'✅' if product.in_stock else '❌'})")
```

#### XML Extraction

```python
from dataclasses import dataclass
from typing import List, Optional
from textlasso import extract

@dataclass 
class Address:
    street: str
    city: str
    country: str
    zip_code: Optional[str] = None
    
@dataclass
class ResponseAddress:
    address: Address

xml_data = """
<address>
    <street>123 Main St</street>
    <city>San Francisco</city>
    <country>USA</country>
    <zip_code>94102</zip_code>
</address>
"""

response_address = extract(xml_data, ResponseAddress, extract_strategy='xml')
print(f"Address: {response_address.address.street}, {response_address.address.city}, {response_address.address.country}")
# Address: 123 Main St, San Francisco, USA
```

### 2. Complex Nested Data Structures

```python
from dataclasses import dataclass
from typing import List, Optional
from enum import Enum

class Department(Enum):
    ENGINEERING = "engineering"
    MARKETING = "marketing" 
    SALES = "sales"
    HR = "hr"

@dataclass
class Employee:
    id: int
    name: str
    department: Department
    salary: float
    skills: List[str]
    manager_id: Optional[int] = None

@dataclass
class Company:
    name: str
    founded_year: int
    employees: List[Employee]
    headquarters: Address

complex_json = """
{
    "name": "TechCorp Inc",
    "founded_year": 2015,
    "headquarters": {
        "street": "100 Tech Plaza",
        "city": "Austin", 
        "country": "USA",
        "zip_code": "78701"
    },
    "employees": [
        {
            "id": 1,
            "name": "Sarah Chen", 
            "department": "engineering",
            "salary": 120000,
            "skills": ["Python", "React", "AWS"],
            "manager_id": null
        },
        {
            "id": 2,
            "name": "Mike Rodriguez",
            "department": "marketing", 
            "salary": 85000,
            "skills": ["SEO", "Content Strategy", "Analytics"],
            "manager_id": 1
        }
    ]
}
"""

company = extract(complex_json, Company, extract_strategy='json')
print(f"Company: {company.name} ({company.founded_year})")
print(f"HQ: {company.headquarters.city}, {company.headquarters.country}")
print(f"Employees: {len(company.employees)}")

for emp in company.employees:
    print(f"  - {emp.name} ({emp.department.value}): {', '.join(emp.skills)}")

# Company: TechCorp Inc (2015)
# HQ: Austin, USA
# Employees: 2
#   - Sarah Chen (engineering): Python, React, AWS
#   - Mike Rodriguez (marketing): SEO, Content Strategy, Analytics
```

### 3. LLM Response Cleaning

```python
from textlasso.cleaners import clear_llm_res

# Clean various LLM response formats
messy_responses = [
    "\```json\\n{\"key\": \"value\"}\\n\```",
    "\```\\n{\"key\": \"value\"}\\n\```", 
    "Here's the data: {\"key\": \"value\"} hope it helps!",
    "\```xml\\n<root><item>data</item></root>\\n\```"
]

for response in messy_responses:
    clean_json = clear_llm_res(response, extract_strategy='json')
    clean_xml = clear_llm_res(response, extract_strategy='xml')
    print(f"Original: {response}")
    print(f"JSON cleaned: {clean_json}")
    print(f"XML cleaned: {clean_xml}")
    print("---")
```

### 4. Advanced Data Extraction with Configuration

```python
from textlasso import extract_from_dict
import logging

# Configure custom logging
logger = logging.getLogger("my_extractor")
logger.setLevel(logging.DEBUG)

@dataclass
class FlexibleData:
    required_field: str
    optional_field: Optional[str] = None
    number_field: int = 0

# Strict mode - raises errors on type mismatches
data_with_extra = {
    "required_field": "test",
    "optional_field": "optional", 
    "number_field": "123",  # String instead of int
    "extra_field": "ignored"  # Extra field
}

# Strict mode (default)
try:
    result_strict = extract_from_dict(
        data_with_extra, 
        FlexibleData,
        strict_mode=True,
        ignore_extra_fields=True,
        logger=logger
    )
    print("Strict mode result:", result_strict)
except Exception as e:
    print("Strict mode error:", e)

# Flexible mode - attempts conversion
result_flexible = extract_from_dict(
    data_with_extra,
    FlexibleData, 
    strict_mode=False,
    ignore_extra_fields=True,
    logger=logger
)
print("Flexible mode result:", result_flexible)
```

### 5. Structured Prompt Generation

#### Basic Prompt Generation

```python
from textlasso import generate_structured_prompt

@dataclass
class UserFeedback:
    rating: int  # 1-5
    comment: str
    category: str
    recommended: bool
    issues: Optional[List[str]] = None

# Generate a structured prompt
prompt = generate_structured_prompt(
    prompt="Analyze this customer review and extract structured feedback",
    schema=UserFeedback,
    strategy="json",
    include_schema_description=True,
    example_count=2
)

print(prompt)
# Output:
# Analyze this customer review and extract structured feedback

# ## OUTPUT FORMAT REQUIREMENTS

# You must respond with a valid JSON object that follows this exact structure:

# ### Schema: UserFeedback
# - **rating**: int (required)
# - **comment**: str (required)
# - **category**: str (required)
# - **recommended**: bool (required)
# - **issues**: Array of str (optional)


# ### JSON Format Rules:
# - Use proper JSON syntax with double quotes for strings
# - Include all required fields
# - Use null for optional fields that are not provided
# - Arrays should contain objects matching the specified structure
# - Numbers should not be quoted
# - Booleans should be true/false (not quoted)


# ## EXAMPLES

# Here are 2 examples of the expected JSON format:

# ### Example 1:
# ```json
# {
#   "rating": 1,
#   "comment": "example_comment_1",
#   "category": "example_category_1",
#   "recommended": true,
#   "issues": [
#     "example_issues_item_1",
#     "example_issues_item_2"
#   ]
# }
# ```

# ### Example 2:
# ```json
# {
#   "rating": 2,
#   "comment": "example_comment_2",
#   "category": "example_category_2",
#   "recommended": false,
#   "issues": [
#     "example_issues_item_1",
#     "example_issues_item_2",
#     "example_issues_item_3"
#   ]
# }
# ```

# Remember: Your response must be valid JSON that matches the specified structure exactly.
```

#### Using the Decorator for Function Enhancement
If you have functions that return prompts, you can use the `@structured_output` decorator to automatically enhance those prompts with structure requirements.

```python

from dataclasses import dataclass
from typing import Optional, List

from textlasso import structured_output

@dataclass
class NewsArticle:
    title: str
    summary: str
    category: str
    sentiment: str
    key_points: List[str]
    publication_date: Optional[str] = None

# decorate prompt-returning function
@structured_output(schema=NewsArticle, strategy="xml", example_count=1)
def create_article_analysis_prompt(article_text: str) -> str:
    return f"""
    Analyze the following news article and extract key information:
    
    Article: {article_text}
    
    Please provide a comprehensive analysis focusing on the main themes,
    sentiment, and key takeaways.
    """

# The decorator automatically enhances your prompt with structure requirements
article_text = "Breaking: New AI breakthrough announced by researchers..."
enhanced_prompt = create_article_analysis_prompt(article_text)

# This prompt now includes schema definitions, examples, and format requirements
print("Enhanced prompt: ", enhanced_prompt)

# Enhanced prompt:  
#     Analyze the following news article and extract key information:
    
#     Article: Breaking: New AI breakthrough announced by researchers...
    
#     Please provide a comprehensive analysis focusing on the main themes,
#     sentiment, and key takeaways.
    


# ## OUTPUT FORMAT REQUIREMENTS

# You must respond with a valid XML object that follows this exact structure:

# ### Schema: NewsArticle
# - **title**: str (required)
# - **summary**: str (required)
# - **category**: str (required)
# - **sentiment**: str (required)
# - **key_points**: Array of str (required)
# - **publication_date**: str (optional)


# ### XML Format Rules:
# - Use proper XML syntax with opening and closing tags
# - Root element should match the main dataclass name
# - Use snake_case for element names
# - For arrays, repeat the element name for each item
# - Use self-closing tags for null/empty optional fields
# - Include all required fields as elements
```

### 6. Real-World Use Cases

#### Processing Survey Responses

```python
@dataclass
class SurveyResponse:
    respondent_id: str
    age_group: str
    satisfaction_rating: int
    feedback: str
    would_recommend: bool
    improvement_areas: List[str]

# Simulating LLM processing of survey data
llm_survey_output = """
Based on the survey response, here's the extracted data:

\```json
{
    "respondent_id": "RESP_001",
    "age_group": "25-34", 
    "satisfaction_rating": 4,
    "feedback": "Great service overall, but could improve response time",
    "would_recommend": true,
    "improvement_areas": ["response_time", "pricing"]
}
\```

This response indicates positive sentiment with specific improvement suggestions.
"""

survey = extract(llm_survey_output, SurveyResponse, extract_strategy='json')
print(survey)
# SurveyResponse(respondent_id='RESP_001', age_group='25-34', satisfaction_rating=4, feedback='Great service overall, but could improve response time', would_recommend=True, improvement_areas=['response_time', 'pricing'])
```

#### E-commerce Product Extraction

```python
@dataclass
class ProductReview:
    product_id: str
    reviewer_name: str
    rating: int
    review_text: str
    verified_purchase: bool
    helpful_votes: int
    review_date: str

@structured_output(schema=ProductReview, strategy="xml")
def create_review_extraction_prompt(raw_review: str) -> str:
    return f"""
    Extract structured information from this product review:
    
    {raw_review}
    
    Pay attention to implicit ratings, sentiment, and any verification indicators.
    """

raw_review = """
★★★★☆ Amazing headphones! by John D. (Verified Purchase) - March 15, 2024
These headphones exceeded my expectations. Great sound quality and comfortable fit.
Battery life could be better but overall very satisfied. Would definitely buy again!
๐Ÿ‘ 47 people found this helpful
"""

extraction_prompt = create_review_extraction_prompt(raw_review)
# Send this prompt to your LLM, then extract the response:
# review = extract(llm_response, ProductReview, extract_strategy='xml')
```

## 🔧 Configuration Options

### Extraction Configuration

```python
from textlasso import extract_from_dict
import logging

# Configure extraction behavior
result = extract_from_dict(
    data_dict=your_data,
    target_class=YourDataClass,
    strict_mode=False,          # Allow type conversions
    ignore_extra_fields=True,   # Ignore unknown fields
    logger=custom_logger,       # Custom logging
    log_level=logging.DEBUG     # Detailed logging
)
```

### Prompt Generation Configuration

```python
from textlasso import generate_structured_prompt

# NOTE: "Your base prompt" and YourSchema are placeholders - pass your own
# prompt text and the dataclass describing the expected output.
prompt = generate_structured_prompt(
    prompt="Your base prompt",
    schema=YourSchema,
    strategy="json",                    # or "xml"
    include_schema_description=True,    # Include the per-field schema listing in the prompt
    example_count=3                     # Number of examples (1-3)
)
```

## 📖 API Reference

### Core Functions

#### `extract(text, target_class, extract_strategy='json')`
Extract structured data from text.

**Parameters:**
- `text` (str): Raw text containing data to extract
- `target_class` (type): Dataclass to convert data into
- `extract_strategy` (Literal['json', 'xml']): Extraction strategy

**Returns:** Instance of `target_class`

#### `extract_from_dict(data_dict, target_class, **options)`
Convert dictionary to dataclass with advanced options.

#### `generate_structured_prompt(prompt, schema, strategy, **options)`
Generate enhanced prompts with structure requirements.

### Decorators

#### `@structured_output(schema, strategy='json', **options)`
Enhance prompt functions with structured output requirements.

#### `@chain_prompts(*prompt_funcs, separator='\n\n---\n\n')`
Chain multiple prompt functions together.

#### `@prompt_cache(maxsize=128)`
Cache prompt results for better performance.

### Utilities

#### `clear_llm_res(text, extract_strategy)`
Clean LLM responses by removing code blocks and formatting.

## ๐Ÿค Contributing

We welcome contributions! Here's how to get started:

1. Fork the repository
2. Create a feature branch: `git checkout -b feature-name`
3. Make your changes and add tests
4. Run tests: `pytest`
5. Submit a pull request

## 📄 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.

## ๐Ÿ™ Acknowledgments

- Built for the AI/LLM community
- Inspired by the need for robust text processing in AI applications
- Special thanks to all contributors and users

## 📞 Support

- 📧 Email: aziznadirov@yahoo.com
- 🐛 Issues: [GitHub Issues](https://github.com/AzizNadirov/textlasso/issues)

---

**TextLasso** - Wrangle your text data with ease! 🤠

            

Raw data

            {
    "_id": null,
    "home_page": null,
    "name": "textlasso",
    "maintainer": null,
    "docs_url": null,
    "requires_python": ">=3.9",
    "maintainer_email": "Aziz Nadirov <aziznadirov@yahoo.com>",
    "keywords": "llm, text, crawl, extract, text-cleaning",
    "author": null,
    "author_email": "Aziz Nadirov <aziznadirov@yahoo.com>",
    "download_url": "https://files.pythonhosted.org/packages/9d/12/f8a646dfce61ab0c187e1dbcea8119145e58d9732e23bf3926be61ccd396/textlasso-0.1.2.tar.gz",
    "platform": null,
    "description": "# TextLasso \ud83e\udd20\n\n[![PyPI version](https://badge.fury.io/py/textlasso.svg)](https://badge.fury.io/py/textlasso)\n[![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)\n[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)\n\n**TextLasso** is a simple Python library for extracting structured data from raw text, with special focus on processing LLM (Large Language Model) responses. Whether you're parsing JSON buried in markdown, extracting data from XML, or need to generate structured prompts for AI models, TextLasso has you covered.\n\n## \u2728 Key Features\n\n- \ud83c\udfaf **Smart Text Extraction**: Extract structured data from messy text with multiple fallback strategies\n- \ud83e\uddf9 **LLM Response Cleaning**: Automatically clean code blocks, markdown artifacts, and formatting\n- \ud83c\udfd7\ufe0f **Dataclass Integration**: Convert raw text directly to Python dataclasses with type validation\n- \ud83e\udd16 **AI Prompt Generation**: Generate structured prompts with schema validation and examples\n- \ud83d\udcca **Multiple Formats**: Support for JSON, XML, and extensible to other formats\n- \ud83d\udd27 **Flexible Configuration**: Configurable error handling, logging, and validation modes\n- \ud83c\udfa8 **Decorator Support**: Enhance existing functions with structured output capabilities\n\n## \ud83d\ude80 Quick Start\n\n### Installation\n\n```bash\npip install textlasso\n```\n\n### Basic Usage\n\n```python\nfrom dataclasses import dataclass\nfrom typing import List, Optional\nfrom textlasso import extract\n\n@dataclass\nclass Person:\n    name: str\n    age: int\n    email: Optional[str] = None\n    skills: List[str] = None\n\n# Extract from messy LLM response\nllm_response = \"\"\"\nHere's the person data you requested:\n\n\\```json\n{\n    \"name\": \"Alice Johnson\",\n    \"age\": 30,\n    \"email\": \"alice@company.com\", 
\n    \"skills\": [\"Python\", \"Machine Learning\", \"Data Science\"]\n}\n\\```\n\nHope this helps!\n\"\"\"\n\nperson = extract(llm_response, Person, extract_strategy='json')\nprint(f\"Extracted: {person.name}, {person.age} years old\")\nprint(person)\n# Extracted: Alice Johnson, 30 years old\n# Person(name='Alice Johnson', age=30, email='alice@company.com', skills=['Python', 'Machine Learning', 'Data Science'])\n```\n\n## \ud83d\udcda Comprehensive Examples\n\n### 1. Basic Text Extraction\n\n#### JSON Extraction with Fallback Strategies\n\n```python\nfrom dataclasses import dataclass\nfrom typing import List, Optional\nfrom textlasso import extract\n\n@dataclass\nclass Product:\n    name: str\n    price: float\n    category: str\n    in_stock: bool\n    tags: Optional[List[str]] = None\n\n# Works with clean JSON\nclean_json = '{\"name\": \"Laptop\", \"price\": 999.99, \"category\": \"Electronics\", \"in_stock\": true}'\n\n# Works with markdown-wrapped JSON\nmarkdown_json = \"\"\"\nHere's your product data:\n```json\n{\n    \"name\": \"Wireless Headphones\",\n    \"price\": 199.99,\n    \"category\": \"Electronics\", \n    \"in_stock\": false,\n    \"tags\": [\"wireless\", \"bluetooth\", \"noise-canceling\"]\n}\n\\```\n\"\"\"\n\n# Works with messy responses\nmessy_response = \"\"\"\nLet me extract that product information for you...\n\nThe product details are: {\"name\": \"Smart Watch\", \"price\": 299.99, \"category\": \"Wearables\", \"in_stock\": true}\n\nIs this what you were looking for?\n\"\"\"\n\n# All of these work automatically\nproducts = [\n    extract(clean_json, Product, extract_strategy='json'),\n    extract(markdown_json, Product, extract_strategy='json'), \n    extract(messy_response, Product, extract_strategy='json')\n]\n\nfor product in products:\n    print(f\"{product.name}: ${product.price} ({'\u2705' if product.in_stock else '\u274c'})\")\n```\n\n#### XML Extraction\n\n```python\nfrom dataclasses import dataclass\nfrom typing import List, 
Optional\nfrom textlasso import extract\n\n@dataclass \nclass Address:\n    street: str\n    city: str\n    country: str\n    zip_code: Optional[str] = None\n    \n@dataclass\nclass ResponseAddress:\n    address: Address\n\nxml_data = \"\"\"\n<address>\n    <street>123 Main St</street>\n    <city>San Francisco</city>\n    <country>USA</country>\n    <zip_code>94102</zip_code>\n</address>\n\"\"\"\n\nresponse_address = extract(xml_data, ResponseAddress, extract_strategy='xml')\nprint(f\"Address: {response_address.address.street}, {response_address.address.city}, {response_address.address.country}\")\n# Address: 123 Main St, San Francisco, USA\n```\n\n### 2. Complex Nested Data Structures\n\n```python\nfrom dataclasses import dataclass\nfrom typing import List, Optional\nfrom enum import Enum\n\nclass Department(Enum):\n    ENGINEERING = \"engineering\"\n    MARKETING = \"marketing\" \n    SALES = \"sales\"\n    HR = \"hr\"\n\n@dataclass\nclass Employee:\n    id: int\n    name: str\n    department: Department\n    salary: float\n    skills: List[str]\n    manager_id: Optional[int] = None\n\n@dataclass\nclass Company:\n    name: str\n    founded_year: int\n    employees: List[Employee]\n    headquarters: Address\n\ncomplex_json = \"\"\"\n{\n    \"name\": \"TechCorp Inc\",\n    \"founded_year\": 2015,\n    \"headquarters\": {\n        \"street\": \"100 Tech Plaza\",\n        \"city\": \"Austin\", \n        \"country\": \"USA\",\n        \"zip_code\": \"78701\"\n    },\n    \"employees\": [\n        {\n            \"id\": 1,\n            \"name\": \"Sarah Chen\", \n            \"department\": \"engineering\",\n            \"salary\": 120000,\n            \"skills\": [\"Python\", \"React\", \"AWS\"],\n            \"manager_id\": null\n        },\n        {\n            \"id\": 2,\n            \"name\": \"Mike Rodriguez\",\n            \"department\": \"marketing\", \n            \"salary\": 85000,\n            \"skills\": [\"SEO\", \"Content Strategy\", \"Analytics\"],\n  
          \"manager_id\": 1\n        }\n    ]\n}\n\"\"\"\n\ncompany = extract(complex_json, Company, extract_strategy='json')\nprint(f\"Company: {company.name} ({company.founded_year})\")\nprint(f\"HQ: {company.headquarters.city}, {company.headquarters.country}\")\nprint(f\"Employees: {len(company.employees)}\")\n\nfor emp in company.employees:\n    print(f\"  - {emp.name} ({emp.department.value}): {', '.join(emp.skills)}\")\n\n# HQ: Austin, USA\n# Employees: 2\n#   - Sarah Chen (engineering): Python, React, AWS\n#   - Mike Rodriguez (marketing): SEO, Content Strategy, Analytics\n```\n\n### 3. LLM Response Cleaning\n\n```python\nfrom textlasso.cleaners import clear_llm_res\n\n# Clean various LLM response formats\nmessy_responses = [\n    \"\\```json\\\\n{\\\"key\\\": \\\"value\\\"}\\\\n\\```\",\n    \"\\```\\\\n{\\\"key\\\": \\\"value\\\"}\\\\n\\```\", \n    \"Here's the data: {\\\"key\\\": \\\"value\\\"} hope it helps!\",\n    \"\\```xml\\\\n<root><item>data</item></root>\\\\n\\```\"\n]\n\nfor response in messy_responses:\n    clean_json = clear_llm_res(response, extract_strategy='json')\n    clean_xml = clear_llm_res(response, extract_strategy='xml')\n    print(f\"Original: {response}\")\n    print(f\"JSON cleaned: {clean_json}\")\n    print(f\"XML cleaned: {clean_xml}\")\n    print(\"---\")\n```\n\n### 4. 
Advanced Data Extraction with Configuration\n\n```python\nfrom textlasso import extract_from_dict\nimport logging\n\n# Configure custom logging\nlogger = logging.getLogger(\"my_extractor\")\nlogger.setLevel(logging.DEBUG)\n\n@dataclass\nclass FlexibleData:\n    required_field: str\n    optional_field: Optional[str] = None\n    number_field: int = 0\n\n# Strict mode - raises errors on type mismatches\ndata_with_extra = {\n    \"required_field\": \"test\",\n    \"optional_field\": \"optional\", \n    \"number_field\": \"123\",  # String instead of int\n    \"extra_field\": \"ignored\"  # Extra field\n}\n\n# Strict mode (default)\ntry:\n    result_strict = extract_from_dict(\n        data_with_extra, \n        FlexibleData,\n        strict_mode=True,\n        ignore_extra_fields=True,\n        logger=logger\n    )\n    print(\"Strict mode result:\", result_strict)\nexcept Exception as e:\n    print(\"Strict mode error:\", e)\n\n# Flexible mode - attempts conversion\nresult_flexible = extract_from_dict(\n    data_with_extra,\n    FlexibleData, \n    strict_mode=False,\n    ignore_extra_fields=True,\n    logger=logger\n)\nprint(\"Flexible mode result:\", result_flexible)\n```\n\n### 5. 
Structured Prompt Generation\n\n#### Basic Prompt Generation\n\n```python\nfrom textlasso import generate_structured_prompt\n\n@dataclass\nclass UserFeedback:\n    rating: int  # 1-5\n    comment: str\n    category: str\n    recommended: bool\n    issues: Optional[List[str]] = None\n\n# Generate a structured prompt\nprompt = generate_structured_prompt(\n    prompt=\"Analyze this customer review and extract structured feedback\",\n    schema=UserFeedback,\n    strategy=\"json\",\n    include_schema_description=True,\n    example_count=2\n)\n\nprint(prompt)\n# Output:\n# Analyze this customer review and extract structured feedback\n\n# ## OUTPUT FORMAT REQUIREMENTS\n\n# You must respond with a valid JSON object that follows this exact structure:\n\n# ### Schema: UserFeedback\n# - **rating**: int (required)\n# - **comment**: str (required)\n# - **category**: str (required)\n# - **recommended**: bool (required)\n# - **issues**: Array of str (optional)\n\n\n# ### JSON Format Rules:\n# - Use proper JSON syntax with double quotes for strings\n# - Include all required fields\n# - Use null for optional fields that are not provided\n# - Arrays should contain objects matching the specified structure\n# - Numbers should not be quoted\n# - Booleans should be true/false (not quoted)\n\n\n# ## EXAMPLES\n\n# Here are 2 examples of the expected JSON format:\n\n# ### Example 1:\n# ```json\n# {\n#   \"rating\": 1,\n#   \"comment\": \"example_comment_1\",\n#   \"category\": \"example_category_1\",\n#   \"recommended\": true,\n#   \"issues\": [\n#     \"example_issues_item_1\",\n#     \"example_issues_item_2\"\n#   ]\n# }\n# ```\n\n# ### Example 2:\n# ```json\n# {\n#   \"rating\": 2,\n#   \"comment\": \"example_comment_2\",\n#   \"category\": \"example_category_2\",\n#   \"recommended\": false,\n#   \"issues\": [\n#     \"example_issues_item_1\",\n#     \"example_issues_item_2\",\n#     \"example_issues_item_3\"\n#   ]\n# }\n# ```\n\n# Remember: Your response must be valid JSON that 
matches the specified structure exactly.\n```\n\n#### Using the Decorator for Function Enhancement\nIf you have a prompt returning functions, you can use the `@structured_output` decorator to automatically enhance your prompts with structure requirements.\n\n```python\n\nfrom dataclasses import dataclass\nfrom typing import Optional, List\n\nfrom textlasso import structured_output\n\n@dataclass\nclass NewsArticle:\n    title: str\n    summary: str\n    category: str\n    sentiment: str\n    key_points: List[str]\n    publication_date: Optional[str] = None\n\n# decorate prompt-returning function\n@structured_output(schema=NewsArticle, strategy=\"xml\", example_count=1)\ndef create_article_analysis_prompt(article_text: str) -> str:\n    return f\"\"\"\n    Analyze the following news article and extract key information:\n    \n    Article: {article_text}\n    \n    Please provide a comprehensive analysis focusing on the main themes,\n    sentiment, and key takeaways.\n    \"\"\"\n\n# The decorator automatically enhances your prompt with structure requirements\narticle_text = \"Breaking: New AI breakthrough announced by researchers...\"\nenhanced_prompt = create_article_analysis_prompt(article_text)\n\n# This prompt now includes schema definitions, examples, and format requirements\nprint(\"Enhanced prompt: \", enhanced_prompt)\n\n# Enhanced prompt:  \n#     Analyze the following news article and extract key information:\n    \n#     Article: Breaking: New AI breakthrough announced by researchers...\n    \n#     Please provide a comprehensive analysis focusing on the main themes,\n#     sentiment, and key takeaways.\n    \n\n\n# ## OUTPUT FORMAT REQUIREMENTS\n\n# You must respond with a valid XML object that follows this exact structure:\n\n# ### Schema: NewsArticle\n# - **title**: str (required)\n# - **summary**: str (required)\n# - **category**: str (required)\n# - **sentiment**: str (required)\n# - **key_points**: Array of str (required)\n# - **publication_date**: 
str (optional)\n\n\n# ### XML Format Rules:\n# - Use proper XML syntax with opening and closing tags\n# - Root element should match the main dataclass name\n# - Use snake_case for element names\n# - For arrays, repeat the element name for each item\n# - Use self-closing tags for null/empty optional fields\n# - Include all required fields as elements\n```\n\n### 6. Real-World Use Cases\n\n#### Processing Survey Responses\n\n```python\n@dataclass\nclass SurveyResponse:\n    respondent_id: str\n    age_group: str\n    satisfaction_rating: int\n    feedback: str\n    would_recommend: bool\n    improvement_areas: List[str]\n\n# Simulating LLM processing of survey data\nllm_survey_output = \"\"\"\nBased on the survey response, here's the extracted data:\n\n\\```json\n{\n    \"respondent_id\": \"RESP_001\",\n    \"age_group\": \"25-34\", \n    \"satisfaction_rating\": 4,\n    \"feedback\": \"Great service overall, but could improve response time\",\n    \"would_recommend\": true,\n    \"improvement_areas\": [\"response_time\", \"pricing\"]\n}\n\\```\n\nThis response indicates positive sentiment with specific improvement suggestions.\n\"\"\"\n\nsurvey = extract(llm_survey_output, SurveyResponse, extract_strategy='json')\nprint(survey)\n# SurveyResponse(respondent_id='RESP_001', age_group='25-34', satisfaction_rating=4, feedback='Great service overall, but could improve response time', would_recommend=True, improvement_areas=['response_time', 'pricing'])\n```\n\n#### E-commerce Product Extraction\n\n```python\n@dataclass\nclass ProductReview:\n    product_id: str\n    reviewer_name: str\n    rating: int\n    review_text: str\n    verified_purchase: bool\n    helpful_votes: int\n    review_date: str\n\n@structured_output(schema=ProductReview, strategy=\"xml\")\ndef create_review_extraction_prompt(raw_review: str) -> str:\n    return f\"\"\"\n    Extract structured information from this product review:\n    \n    {raw_review}\n    \n    Pay attention to implicit ratings, 
sentiment, and any verification indicators.\n    \"\"\"\n\nraw_review = \"\"\"\n\u2605\u2605\u2605\u2605\u2606 Amazing headphones! by John D. (Verified Purchase) - March 15, 2024\nThese headphones exceeded my expectations. Great sound quality and comfortable fit.\nBattery life could be better but overall very satisfied. Would definitely buy again!\n\ud83d\udc4d 47 people found this helpful\n\"\"\"\n\nextraction_prompt = create_review_extraction_prompt(raw_review)\n# Send this prompt to your LLM, then extract the response:\n# review = extract(llm_response, ProductReview, extract_strategy='xml')\n```\n\n## \ud83d\udd27 Configuration Options\n\n### Extraction Configuration\n\n```python\nfrom textlasso import extract_from_dict\nimport logging\n\n# Configure extraction behavior\nresult = extract_from_dict(\n    data_dict=your_data,\n    target_class=YourDataClass,\n    strict_mode=False,          # Allow type conversions\n    ignore_extra_fields=True,   # Ignore unknown fields\n    logger=custom_logger,       # Custom logging\n    log_level=logging.DEBUG     # Detailed logging\n)\n```\n\n### Prompt Generation Configuration\n\n```python\nfrom textlasso import generate_structured_prompt\n\nprompt = generate_structured_prompt(\n    prompt=\"Your base prompt\",\n    schema=YourSchema,\n    strategy=\"json\",                    # or \"xml\"\n    include_schema_description=True,    # Include field descriptions\n    example_count=3                     # Number of examples (1-3)\n)\n```\n\n## \ud83d\udcd6 API Reference\n\n### Core Functions\n\n#### `extract(text, target_class, extract_strategy='json')`\nExtract structured data from text.\n\n**Parameters:**\n- `text` (str): Raw text containing data to extract\n- `target_class` (type): Dataclass to convert data into\n- `extract_strategy` (Literal['json', 'xml']): Extraction strategy\n\n**Returns:** Instance of `target_class`\n\n#### `extract_from_dict(data_dict, target_class, **options)`\nConvert dictionary to dataclass with 
advanced options.\n\n#### `generate_structured_prompt(prompt, schema, strategy, **options)`\nGenerate enhanced prompts with structure requirements.\n\n### Decorators\n\n#### `@structured_output(schema, strategy='json', **options)`\nEnhance prompt functions with structured output requirements.\n\n#### `@chain_prompts(*prompt_funcs, separator='\\n\\n---\\n\\n')`\nChain multiple prompt functions together.\n\n#### `@prompt_cache(maxsize=128)`\nCache prompt results for better performance.\n\n### Utilities\n\n#### `clear_llm_res(text, extract_strategy)`\nClean LLM responses by removing code blocks and formatting.\n\n## \ud83e\udd1d Contributing\n\nWe welcome contributions! Here's how to get started:\n\n1. Fork the repository\n2. Create a feature branch: `git checkout -b feature-name`\n3. Make your changes and add tests\n4. Run tests: `pytest`\n5. Submit a pull request\n\n## \ud83d\udcc4 License\n\nThis project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.\n\n## \ud83d\ude4f Acknowledgments\n\n- Built for the AI/LLM community\n- Inspired by the need for robust text processing in AI applications\n- Special thanks to all contributors and users\n\n## \ud83d\udcde Support\n\n- \ud83d\udce7 Email: aziznadirov@yahoo.com\n- \ud83d\udc1b Issues: [GitHub Issues](https://github.com/AzizNadirov/textlasso/issues)\n\n---\n\n**TextLasso** - Wrangle your text data with ease! \ud83e\udd20\n",
    "bugtrack_url": null,
    "license": null,
    "summary": "Simple packego for grab data from raw text. ",
    "version": "0.1.2",
    "project_urls": {
        "Homepage": "https://github.com/AzizNadirov/textlasso",
        "Repository": "https://github.com/AzizNadirov/textlasso.git"
    },
    "split_keywords": [
        "llm",
        " text",
        " crawl",
        " extract",
        " text-cleaning"
    ],
    "urls": [
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "3a4729c1f190d28d5bdb40326338f2b178b0c2f560305845d61aa77b9981cec4",
                "md5": "44eeb712c5870f1d25f1eda0adc969d7",
                "sha256": "f64aeda09b33fe3df13f049a901c8bd82554be7c0de4bf5c64512367d986bfbc"
            },
            "downloads": -1,
            "filename": "textlasso-0.1.2-py3-none-any.whl",
            "has_sig": false,
            "md5_digest": "44eeb712c5870f1d25f1eda0adc969d7",
            "packagetype": "bdist_wheel",
            "python_version": "py3",
            "requires_python": ">=3.9",
            "size": 22243,
            "upload_time": "2025-07-26T14:44:29",
            "upload_time_iso_8601": "2025-07-26T14:44:29.789551Z",
            "url": "https://files.pythonhosted.org/packages/3a/47/29c1f190d28d5bdb40326338f2b178b0c2f560305845d61aa77b9981cec4/textlasso-0.1.2-py3-none-any.whl",
            "yanked": false,
            "yanked_reason": null
        },
        {
            "comment_text": null,
            "digests": {
                "blake2b_256": "9d12f8a646dfce61ab0c187e1dbcea8119145e58d9732e23bf3926be61ccd396",
                "md5": "7205d492a24bd16de79a38488b8a1827",
                "sha256": "de897a5b689c59d649ebfb439c86bb5b5ed22d401adc8177da586b94028bdc9f"
            },
            "downloads": -1,
            "filename": "textlasso-0.1.2.tar.gz",
            "has_sig": false,
            "md5_digest": "7205d492a24bd16de79a38488b8a1827",
            "packagetype": "sdist",
            "python_version": "source",
            "requires_python": ">=3.9",
            "size": 25981,
            "upload_time": "2025-07-26T14:44:31",
            "upload_time_iso_8601": "2025-07-26T14:44:31.242131Z",
            "url": "https://files.pythonhosted.org/packages/9d/12/f8a646dfce61ab0c187e1dbcea8119145e58d9732e23bf3926be61ccd396/textlasso-0.1.2.tar.gz",
            "yanked": false,
            "yanked_reason": null
        }
    ],
    "upload_time": "2025-07-26 14:44:31",
    "github": true,
    "gitlab": false,
    "bitbucket": false,
    "codeberg": false,
    "github_user": "AzizNadirov",
    "github_project": "textlasso",
    "travis_ci": false,
    "coveralls": false,
    "github_actions": false,
    "lcname": "textlasso"
}
        
Elapsed time: 2.09145s