# XML Analysis Framework
[](https://www.python.org/downloads/)
[](https://opensource.org/licenses/MIT)
[](./test_results)
[](./src/handlers)
[](./AI_INTEGRATION_ARCHITECTURE.md)
A production-ready XML document analysis and preprocessing framework with **29 specialized handlers** designed for AI/ML data pipelines. Transform any XML document into structured, AI-ready data and optimized chunks with **100% success rate** across 71 diverse test files.
## 🔗 Part of Analysis Framework Suite
This framework is part of a unified suite of document analysis tools that share a consistent interface:
- **[analysis-framework-base](https://pypi.org/project/analysis-framework-base/)** - Base interfaces and shared models
- **[xml-analysis-framework](https://pypi.org/project/xml-analysis-framework/)** - XML document analysis (this package)
- **[docling-analysis-framework](https://pypi.org/project/docling-analysis-framework/)** - PDF/Office documents via Docling
- **[document-analysis-framework](https://pypi.org/project/document-analysis-framework/)** - Text, code, config files
- **[data-analysis-framework](https://pypi.org/project/data-analysis-framework/)** - Structured data analysis
All frameworks implement the same `BaseAnalyzer` and `BaseChunker` interfaces from `analysis-framework-base`, enabling:
- **Consistent API** across document types
- **Easy framework switching** with minimal code changes
- **Unified result format** for downstream processing
- **Shared tooling** and utilities
## 🚀 Quick Start
### Simple API - Get Started in Seconds
```python
import xml_analysis_framework as xaf
# 🎯 One-line analysis with specialized handlers
result = xaf.analyze("path/to/file.xml")
print(f"Document type: {result['document_type'].type_name}")
print(f"Handler used: {result['handler_used']}")
# 📊 Basic schema analysis
schema = xaf.analyze_schema("path/to/file.xml")
print(f"Elements: {schema.total_elements}, Depth: {schema.max_depth}")
# ✂️ Smart chunking for AI/ML
chunks = xaf.chunk("path/to/file.xml", strategy="auto")
print(f"Created {len(chunks)} optimized chunks")
# 💾 Save chunks to JSON
import json
# Convert chunks to JSON-serializable format
chunks_data = [
{
"chunk_id": chunk.chunk_id,
"content": chunk.content,
"element_path": chunk.element_path,
"start_line": chunk.start_line,
"end_line": chunk.end_line,
"elements_included": chunk.elements_included,
"metadata": chunk.metadata,
"token_estimate": chunk.token_estimate
}
for chunk in chunks
]
# Write to file
with open("chunks_output.json", "w") as f:
json.dump(chunks_data, f, indent=2)
```
### Advanced Usage
```python
import xml_analysis_framework as xaf
# Enhanced analysis with full results
analysis = xaf.analyze_enhanced("document.xml")
print(f"Type: {analysis.type_name} (confidence: {analysis.confidence:.2f})")
print(f"AI use cases: {len(analysis.ai_use_cases)}")
if analysis.quality_metrics:
print(f"Quality score: {analysis.quality_metrics.get('completeness_score')}")
else:
print("Quality metrics: Not available")
# Different chunking strategies
hierarchical_chunks = xaf.chunk("document.xml", strategy="hierarchical")
sliding_chunks = xaf.chunk("document.xml", strategy="sliding_window")
content_chunks = xaf.chunk("document.xml", strategy="content_aware")
# Process chunks
for chunk in hierarchical_chunks:
print(f"Chunk {chunk.chunk_id}: {len(chunk.content)} chars")
print(f"Path: {chunk.element_path}, Elements: {len(chunk.elements_included)}")
# 💾 Save different chunking strategies to separate files
import json
# Helper function to convert chunk to dict
def chunk_to_dict(chunk):
return {
"chunk_id": chunk.chunk_id,
"content": chunk.content,
"element_path": chunk.element_path,
"start_line": chunk.start_line,
"end_line": chunk.end_line,
"elements_included": chunk.elements_included,
"metadata": chunk.metadata,
"token_estimate": chunk.token_estimate
}
# Save each strategy's results
strategies = {
"hierarchical": hierarchical_chunks,
"sliding_window": sliding_chunks,
"content_aware": content_chunks
}
for strategy_name, chunks in strategies.items():
chunks_data = [chunk_to_dict(chunk) for chunk in chunks]
with open(f"chunks_{strategy_name}.json", "w") as f:
json.dump({
"strategy": strategy_name,
"total_chunks": len(chunks_data),
"chunks": chunks_data
}, f, indent=2)
print(f"Saved {len(chunks_data)} chunks to chunks_{strategy_name}.json")
```
### Expert Usage - Direct Class Access
```python
# For advanced customization, use the classes directly
from xml_analysis_framework import XMLDocumentAnalyzer, ChunkingOrchestrator
analyzer = XMLDocumentAnalyzer(max_file_size_mb=500)
orchestrator = ChunkingOrchestrator(max_file_size_mb=1000)
# Custom analysis
result = analyzer.analyze_document("file.xml")
# Custom chunking with config (result works directly now!)
from xml_analysis_framework.core.chunking import ChunkingConfig
config = ChunkingConfig(
max_chunk_size=2000,
min_chunk_size=300,
overlap_size=150,
preserve_hierarchy=True
)
chunks = orchestrator.chunk_document("file.xml", result, strategy="auto", config=config)
# 💾 Save with analysis metadata
import json
from datetime import datetime
output_data = {
"metadata": {
"file": "file.xml",
"processed_at": datetime.now().isoformat(),
"document_type": result.type_name,
"confidence": result.confidence,
"handler_used": result.handler_used,
"chunking_config": {
"strategy": "auto",
"max_chunk_size": config.max_chunk_size,
"min_chunk_size": config.min_chunk_size,
"overlap_size": config.overlap_size,
"preserve_hierarchy": config.preserve_hierarchy
}
},
"analysis": {
"ai_use_cases": result.ai_use_cases,
"key_findings": result.key_findings,
"quality_metrics": result.quality_metrics
},
"chunks": [
{
"chunk_id": chunk.chunk_id,
"content": chunk.content,
"element_path": chunk.element_path,
"start_line": chunk.start_line,
"end_line": chunk.end_line,
"elements_included": chunk.elements_included,
"metadata": chunk.metadata,
"token_estimate": chunk.token_estimate
}
for chunk in chunks
]
}
with open("analysis_and_chunks.json", "w") as f:
json.dump(output_data, f, indent=2)
print(f"Saved complete analysis with {len(chunks)} chunks to analysis_and_chunks.json")
```
## 🎯 Key Features
### 1. **🧠 29 Specialized XML Handlers**
Automatically detects and analyzes different XML document types:
- **Security & Compliance**: SCAP, SAML, SOAP
- **DevOps & Build**: Maven POM, Ant, Ivy, Spring, Log4j
- **Content & Documentation**: RSS/Atom, DocBook, XHTML, SVG
- **Enterprise Systems**: ServiceNow, Hibernate, Struts configurations
- **Data & APIs**: GPX, KML, GraphML, WADL/WSDL, XML Schemas
### 2. **⚡ Intelligent Chunking Strategies**
- **Hierarchical**: Preserves document structure and relationships
- **Sliding Window**: Fixed-size chunks with configurable overlap
- **Content-Aware**: Groups related content based on semantic meaning
- **Auto-Selection**: Automatically chooses best strategy based on document type
### 3. **🤖 AI/ML Ready Output**
- **Token-Optimized**: Chunks sized for LLM context windows
- **Rich Metadata**: Each chunk includes context, line numbers, and relationships
- **JSON Export**: Easy integration with vector stores and AI pipelines
- **Quality Metrics**: Automated assessment of data completeness and structure
### 4. **🔒 Enterprise Security**
- **Safe XML Parsing**: Uses defusedxml to prevent XXE attacks
- **File Size Limits**: Configurable limits to prevent resource exhaustion
- **Minimal Dependencies**: Only defusedxml + Python standard library
## 📋 Supported Document Types
| Category | Document Types | Common Use Cases |
| -------- | -------------- | ---------------- |
| **Security & Compliance** | SCAP, SAML, SOAP | Vulnerability scanning, authentication, web services |
| **Build & Configuration** | Maven POM, Ant, Spring, Log4j | Dependency management, build automation, app config |
| **Enterprise Systems** | ServiceNow, Hibernate, Struts | IT service management, ORM mapping, web frameworks |
| **Content & Media** | RSS/Atom, DocBook, XHTML, SVG | Feeds, documentation, web content, graphics |
| **Geospatial** | GPX, KML, GraphML | GPS tracking, maps, network graphs |
| **APIs & Services** | WADL, WSDL, OpenAPI | REST APIs, SOAP services, API documentation |
| **Data Exchange** | XLIFF, XML Sitemap, Generic XML | Translations, SEO, custom formats |
## 🔒 Security
### XML Security Protection
This framework uses **defusedxml** to protect against common XML security vulnerabilities:
- **XXE (XML External Entity) attacks**: Prevents reading local files or making network requests
- **Billion Laughs attack**: Prevents exponential entity expansion DoS attacks
- **DTD retrieval**: Blocks external DTD fetching to prevent data exfiltration
#### Security Features
```python
import xml_analysis_framework as xaf
# Safe parsing - malicious XML will be rejected automatically
try:
result = xaf.analyze("potentially_malicious.xml")
except Exception as e:
print(f"Security threat detected: {e}")
# The framework automatically protects against:
# - XXE attacks
# - Billion laughs / exponential entity expansion
# - External DTD retrieval
```
#### Best Practices
1. **Always use the framework's parsers** - Never use `xml.etree.ElementTree` directly
2. **Validate file sizes** - Set reasonable limits for your use case
3. **Sanitize file paths** - Ensure input paths are properly validated
4. **Monitor for security exceptions** - Log and alert on security-blocked parsing attempts
### File Size Limits
The framework includes built-in file size limits to prevent memory exhaustion:
```python
import xml_analysis_framework as xaf
from xml_analysis_framework import XMLDocumentAnalyzer, ChunkingOrchestrator
# Default limits are reasonable for most use cases
# But you can customize them:
# Create analyzer with custom 50MB limit
analyzer = XMLDocumentAnalyzer(max_file_size_mb=50.0)
result = analyzer.analyze_document("large_file.xml")
# Create chunking orchestrator with 100MB limit
orchestrator = ChunkingOrchestrator(max_file_size_mb=100.0)
chunks = orchestrator.chunk_document("large_file.xml", result)
# For simple API, defaults are used automatically
try:
result = xaf.analyze("very_large_file.xml")
except ValueError as e:
print(f"File too large: {e}")
```
## 🔧 Installation
```bash
# Install from PyPI (recommended)
pip install xml-analysis-framework
# Install from source
git clone https://github.com/redhat-ai-americas/xml-analysis-framework.git
cd xml-analysis-framework
pip install -e .
# Or install development dependencies
pip install -e .[dev]
```
### Dependencies
- **defusedxml** (0.7.1+): For secure XML parsing protection
- Python standard library (3.8+) for all other functionality
## 🧪 Testing
The framework includes comprehensive tests for all handlers and features:
```bash
# Run all tests
python -m pytest tests/
# Run specific test categories
python -m pytest tests/unit/ # Unit tests for handlers
python -m pytest tests/integration/ # Integration tests
python -m pytest tests/comprehensive/ # Full system tests
```
## 🤖 AI/ML Integration
### AI Processing Pipeline
```
XML Documents → Analysis Framework → Structured Output → AI/ML Systems
1. Document Analysis (29 specialized handlers)
2. Smart Chunking (token-optimized)
3. JSON Export (with metadata)
4. Integration with:
- Vector databases (semantic search)
- LLMs (document Q&A, analysis)
- Graph databases (relationship mapping)
- ML pipelines (feature extraction)
```
### Common AI Use Cases
- **Security Intelligence**: Analyze SCAP reports, detect vulnerabilities, compliance monitoring
- **DevOps Automation**: Dependency analysis, configuration validation, build optimization
- **Enterprise Search**: Semantic search across technical documentation and configurations
- **Knowledge Extraction**: Extract structured data from XML for ML training datasets
## 🚀 Extending the Framework
### Adding New Handlers
```python
from xml_analysis_framework.base import XMLHandler, SpecializedAnalysis, DocumentTypeInfo
class CustomHandler(XMLHandler):
def can_handle_xml(self, root, namespaces):
# Check if this handler can process the document
if root.tag == 'custom-format':
return True, 1.0 # (can_handle, confidence)
return False, 0.0
def detect_xml_type(self, root, namespaces):
return DocumentTypeInfo(
type_name="Custom Format",
confidence=1.0,
version="1.0"
)
def analyze_xml(self, root, file_path):
return SpecializedAnalysis(
type_name="Custom Format",
confidence=1.0,
key_findings={"custom_data": "value"},
ai_use_cases=["Custom AI application"],
structured_data={"extracted": "data"},
file_path=file_path,
handler_used="CustomHandler"
)
def extract_xml_key_data(self, root):
# Extract key data specific to your format
return {"key": "value"}
```
### Custom Chunking Strategies
```python
from xml_analysis_framework.core.chunking import XMLChunkingStrategy, XMLChunk
import xml.etree.ElementTree as ET
class CustomChunking(XMLChunkingStrategy):
def chunk_document(self, file_path, specialized_analysis=None):
chunks = []
tree = ET.parse(file_path)
root = tree.getroot()
# Custom chunking logic
for i, element in enumerate(root):
chunk = XMLChunk(
chunk_id=f"custom_{i}",
content=ET.tostring(element, encoding='unicode'),
element_path=f"/{element.tag}",
start_line=1,
end_line=10,
parent_context=None,
metadata={"custom": True},
token_estimate=100,
elements_included=[element.tag]
)
chunks.append(chunk)
return chunks
# Use with the framework
import xml_analysis_framework as xaf
from xml_analysis_framework import ChunkingOrchestrator
orchestrator = ChunkingOrchestrator()
# The orchestrator will use your custom strategy when needed
```
## 🤝 Contributing
We welcome contributions! Whether you're adding new XML handlers, improving chunking algorithms, or enhancing AI integrations, your contributions help make XML analysis more accessible and powerful.
**Priority contribution areas:**
- 🎯 New XML format handlers (ERP, CRM, healthcare, government)
- ⚡ Enhanced chunking algorithms and strategies
- 🚀 Performance optimizations for large files
- 🤖 Advanced AI/ML integration examples
- 📝 Documentation and usage examples
**👉 See [CONTRIBUTING.md](CONTRIBUTING.md) for complete guidelines, development setup, and submission process.**
## 📄 License
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
## 🙏 Acknowledgments
- Designed as part of the **AI Building Blocks** initiative
- Built for the modern AI/ML ecosystem
- Community-driven XML format support
Raw data
{
"_id": null,
"home_page": "https://github.com/redhat-ai-americas/xml-analysis-framework",
"name": "xml-analysis-framework",
"maintainer": null,
"docs_url": null,
"requires_python": ">=3.8",
"maintainer_email": null,
"keywords": "xml, analysis, ai, ml, document-processing, semantic-search",
"author": "Wes Jackson",
"author_email": "AI Building Blocks <wjackson@redhat.com>",
"download_url": "https://files.pythonhosted.org/packages/91/f8/d87bec1702a9e5aa6992567f4fa5f6630b6221070f93a25a031f9ee64ee3/xml_analysis_framework-2.0.0.tar.gz",
"platform": null,
"description": "# XML Analysis Framework\n\n[](https://www.python.org/downloads/)\n[](https://opensource.org/licenses/MIT)\n[](./test_results)\n[](./src/handlers)\n[](./AI_INTEGRATION_ARCHITECTURE.md)\n\nA production-ready XML document analysis and preprocessing framework with **29 specialized handlers** designed for AI/ML data pipelines. Transform any XML document into structured, AI-ready data and optimized chunks with **100% success rate** across 71 diverse test files.\n\n## \ud83d\udd17 Part of Analysis Framework Suite\n\nThis framework is part of a unified suite of document analysis tools that share a consistent interface:\n\n- **[analysis-framework-base](https://pypi.org/project/analysis-framework-base/)** - Base interfaces and shared models\n- **[xml-analysis-framework](https://pypi.org/project/xml-analysis-framework/)** - XML document analysis (this package)\n- **[docling-analysis-framework](https://pypi.org/project/docling-analysis-framework/)** - PDF/Office documents via Docling\n- **[document-analysis-framework](https://pypi.org/project/document-analysis-framework/)** - Text, code, config files\n- **[data-analysis-framework](https://pypi.org/project/data-analysis-framework/)** - Structured data analysis\n\nAll frameworks implement the same `BaseAnalyzer` and `BaseChunker` interfaces from `analysis-framework-base`, enabling:\n- **Consistent API** across document types\n- **Easy framework switching** with minimal code changes\n- **Unified result format** for downstream processing\n- **Shared tooling** and utilities\n\n## \ud83d\ude80 Quick Start\n\n### Simple API - Get Started in Seconds\n\n```python\nimport xml_analysis_framework as xaf\n\n# \ud83c\udfaf One-line analysis with specialized handlers\nresult = xaf.analyze(\"path/to/file.xml\")\nprint(f\"Document type: {result['document_type'].type_name}\")\nprint(f\"Handler used: {result['handler_used']}\")\n\n# \ud83d\udcca Basic schema analysis \nschema = 
xaf.analyze_schema(\"path/to/file.xml\")\nprint(f\"Elements: {schema.total_elements}, Depth: {schema.max_depth}\")\n\n# \u2702\ufe0f Smart chunking for AI/ML\nchunks = xaf.chunk(\"path/to/file.xml\", strategy=\"auto\")\nprint(f\"Created {len(chunks)} optimized chunks\")\n\n# \ud83d\udcbe Save chunks to JSON\nimport json\n\n# Convert chunks to JSON-serializable format\nchunks_data = [\n {\n \"chunk_id\": chunk.chunk_id,\n \"content\": chunk.content,\n \"element_path\": chunk.element_path,\n \"start_line\": chunk.start_line,\n \"end_line\": chunk.end_line,\n \"elements_included\": chunk.elements_included,\n \"metadata\": chunk.metadata,\n \"token_estimate\": chunk.token_estimate\n }\n for chunk in chunks\n]\n\n# Write to file\nwith open(\"chunks_output.json\", \"w\") as f:\n json.dump(chunks_data, f, indent=2)\n```\n\n### Advanced Usage\n\n```python\nimport xml_analysis_framework as xaf\n\n# Enhanced analysis with full results\nanalysis = xaf.analyze_enhanced(\"document.xml\")\n\nprint(f\"Type: {analysis.type_name} (confidence: {analysis.confidence:.2f})\")\nprint(f\"AI use cases: {len(analysis.ai_use_cases)}\")\nif analysis.quality_metrics:\n print(f\"Quality score: {analysis.quality_metrics.get('completeness_score')}\")\nelse:\n print(\"Quality metrics: Not available\")\n\n# Different chunking strategies\nhierarchical_chunks = xaf.chunk(\"document.xml\", strategy=\"hierarchical\")\nsliding_chunks = xaf.chunk(\"document.xml\", strategy=\"sliding_window\") \ncontent_chunks = xaf.chunk(\"document.xml\", strategy=\"content_aware\")\n\n# Process chunks\nfor chunk in hierarchical_chunks:\n print(f\"Chunk {chunk.chunk_id}: {len(chunk.content)} chars\")\n print(f\"Path: {chunk.element_path}, Elements: {len(chunk.elements_included)}\")\n\n# \ud83d\udcbe Save different chunking strategies to separate files\nimport json\n\n# Helper function to convert chunk to dict\ndef chunk_to_dict(chunk):\n return {\n \"chunk_id\": chunk.chunk_id,\n \"content\": chunk.content,\n 
\"element_path\": chunk.element_path,\n \"start_line\": chunk.start_line,\n \"end_line\": chunk.end_line,\n \"elements_included\": chunk.elements_included,\n \"metadata\": chunk.metadata,\n \"token_estimate\": chunk.token_estimate\n }\n\n# Save each strategy's results\nstrategies = {\n \"hierarchical\": hierarchical_chunks,\n \"sliding_window\": sliding_chunks,\n \"content_aware\": content_chunks\n}\n\nfor strategy_name, chunks in strategies.items():\n chunks_data = [chunk_to_dict(chunk) for chunk in chunks]\n \n with open(f\"chunks_{strategy_name}.json\", \"w\") as f:\n json.dump({\n \"strategy\": strategy_name,\n \"total_chunks\": len(chunks_data),\n \"chunks\": chunks_data\n }, f, indent=2)\n \n print(f\"Saved {len(chunks_data)} chunks to chunks_{strategy_name}.json\")\n```\n\n### Expert Usage - Direct Class Access\n\n```python\n# For advanced customization, use the classes directly\nfrom xml_analysis_framework import XMLDocumentAnalyzer, ChunkingOrchestrator\n\nanalyzer = XMLDocumentAnalyzer(max_file_size_mb=500)\norchestrator = ChunkingOrchestrator(max_file_size_mb=1000)\n\n# Custom analysis\nresult = analyzer.analyze_document(\"file.xml\")\n\n# Custom chunking with config (result works directly now!)\nfrom xml_analysis_framework.core.chunking import ChunkingConfig\nconfig = ChunkingConfig(\n max_chunk_size=2000,\n min_chunk_size=300,\n overlap_size=150,\n preserve_hierarchy=True\n)\nchunks = orchestrator.chunk_document(\"file.xml\", result, strategy=\"auto\", config=config)\n\n# \ud83d\udcbe Save with analysis metadata\nimport json\nfrom datetime import datetime\n\noutput_data = {\n \"metadata\": {\n \"file\": \"file.xml\",\n \"processed_at\": datetime.now().isoformat(),\n \"document_type\": result.type_name,\n \"confidence\": result.confidence,\n \"handler_used\": result.handler_used,\n \"chunking_config\": {\n \"strategy\": \"auto\",\n \"max_chunk_size\": config.max_chunk_size,\n \"min_chunk_size\": config.min_chunk_size,\n \"overlap_size\": 
config.overlap_size,\n \"preserve_hierarchy\": config.preserve_hierarchy\n }\n },\n \"analysis\": {\n \"ai_use_cases\": result.ai_use_cases,\n \"key_findings\": result.key_findings,\n \"quality_metrics\": result.quality_metrics\n },\n \"chunks\": [\n {\n \"chunk_id\": chunk.chunk_id,\n \"content\": chunk.content,\n \"element_path\": chunk.element_path,\n \"start_line\": chunk.start_line,\n \"end_line\": chunk.end_line,\n \"elements_included\": chunk.elements_included,\n \"metadata\": chunk.metadata,\n \"token_estimate\": chunk.token_estimate\n }\n for chunk in chunks\n ]\n}\n\nwith open(\"analysis_and_chunks.json\", \"w\") as f:\n json.dump(output_data, f, indent=2)\n\nprint(f\"Saved complete analysis with {len(chunks)} chunks to analysis_and_chunks.json\")\n```\n\n## \ud83c\udfaf Key Features\n\n### 1. **\ud83e\udde0 29 Specialized XML Handlers**\n\nAutomatically detects and analyzes different XML document types:\n\n- **Security & Compliance**: SCAP, SAML, SOAP\n- **DevOps & Build**: Maven POM, Ant, Ivy, Spring, Log4j\n- **Content & Documentation**: RSS/Atom, DocBook, XHTML, SVG\n- **Enterprise Systems**: ServiceNow, Hibernate, Struts configurations\n- **Data & APIs**: GPX, KML, GraphML, WADL/WSDL, XML Schemas\n\n### 2. **\u26a1 Intelligent Chunking Strategies**\n\n- **Hierarchical**: Preserves document structure and relationships\n- **Sliding Window**: Fixed-size chunks with configurable overlap\n- **Content-Aware**: Groups related content based on semantic meaning\n- **Auto-Selection**: Automatically chooses best strategy based on document type\n\n### 3. **\ud83e\udd16 AI/ML Ready Output**\n\n- **Token-Optimized**: Chunks sized for LLM context windows\n- **Rich Metadata**: Each chunk includes context, line numbers, and relationships\n- **JSON Export**: Easy integration with vector stores and AI pipelines\n- **Quality Metrics**: Automated assessment of data completeness and structure\n\n### 4. 
**\ud83d\udd12 Enterprise Security**\n\n- **Safe XML Parsing**: Uses defusedxml to prevent XXE attacks\n- **File Size Limits**: Configurable limits to prevent resource exhaustion\n- **Minimal Dependencies**: Only defusedxml + Python standard library\n\n## \ud83d\udccb Supported Document Types\n\n| Category | Document Types | Common Use Cases |\n| -------- | -------------- | ---------------- |\n| **Security & Compliance** | SCAP, SAML, SOAP | Vulnerability scanning, authentication, web services |\n| **Build & Configuration** | Maven POM, Ant, Spring, Log4j | Dependency management, build automation, app config |\n| **Enterprise Systems** | ServiceNow, Hibernate, Struts | IT service management, ORM mapping, web frameworks |\n| **Content & Media** | RSS/Atom, DocBook, XHTML, SVG | Feeds, documentation, web content, graphics |\n| **Geospatial** | GPX, KML, GraphML | GPS tracking, maps, network graphs |\n| **APIs & Services** | WADL, WSDL, OpenAPI | REST APIs, SOAP services, API documentation |\n| **Data Exchange** | XLIFF, XML Sitemap, Generic XML | Translations, SEO, custom formats |\n\n\n## \ud83d\udd12 Security\n\n### XML Security Protection\n\nThis framework uses **defusedxml** to protect against common XML security vulnerabilities:\n\n- **XXE (XML External Entity) attacks**: Prevents reading local files or making network requests\n- **Billion Laughs attack**: Prevents exponential entity expansion DoS attacks\n- **DTD retrieval**: Blocks external DTD fetching to prevent data exfiltration\n\n#### Security Features\n\n```python\nimport xml_analysis_framework as xaf\n\n# Safe parsing - malicious XML will be rejected automatically\ntry:\n result = xaf.analyze(\"potentially_malicious.xml\")\nexcept Exception as e:\n print(f\"Security threat detected: {e}\")\n\n# The framework automatically protects against:\n# - XXE attacks\n# - Billion laughs / exponential entity expansion\n# - External DTD retrieval\n```\n\n#### Best Practices\n\n1. 
**Always use the framework's parsers** - Never use `xml.etree.ElementTree` directly\n2. **Validate file sizes** - Set reasonable limits for your use case\n3. **Sanitize file paths** - Ensure input paths are properly validated\n4. **Monitor for security exceptions** - Log and alert on security-blocked parsing attempts\n\n### File Size Limits\n\nThe framework includes built-in file size limits to prevent memory exhaustion:\n\n```python\nimport xml_analysis_framework as xaf\nfrom xml_analysis_framework import XMLDocumentAnalyzer, ChunkingOrchestrator\n\n# Default limits are reasonable for most use cases\n# But you can customize them:\n\n# Create analyzer with custom 50MB limit\nanalyzer = XMLDocumentAnalyzer(max_file_size_mb=50.0)\nresult = analyzer.analyze_document(\"large_file.xml\")\n\n# Create chunking orchestrator with 100MB limit \norchestrator = ChunkingOrchestrator(max_file_size_mb=100.0)\nchunks = orchestrator.chunk_document(\"large_file.xml\", result)\n\n# For simple API, defaults are used automatically\ntry:\n result = xaf.analyze(\"very_large_file.xml\")\nexcept ValueError as e:\n print(f\"File too large: {e}\")\n```\n\n## \ud83d\udd27 Installation\n\n```bash\n# Install from PyPI (recommended)\npip install xml-analysis-framework\n\n# Install from source\ngit clone https://github.com/redhat-ai-americas/xml-analysis-framework.git\ncd xml-analysis-framework\npip install -e .\n\n# Or install development dependencies\npip install -e .[dev]\n```\n\n### Dependencies\n\n- **defusedxml** (0.7.1+): For secure XML parsing protection\n- Python standard library (3.8+) for all other functionality\n\n\n## \ud83e\uddea Testing\n\nThe framework includes comprehensive tests for all handlers and features:\n\n```bash\n# Run all tests\npython -m pytest tests/\n\n# Run specific test categories\npython -m pytest tests/unit/ # Unit tests for handlers\npython -m pytest tests/integration/ # Integration tests\npython -m pytest tests/comprehensive/ # Full system tests\n```\n\n\n## 
\ud83e\udd16 AI/ML Integration\n\n### AI Processing Pipeline\n\n```\nXML Documents \u2192 Analysis Framework \u2192 Structured Output \u2192 AI/ML Systems\n\n1. Document Analysis (29 specialized handlers)\n2. Smart Chunking (token-optimized)\n3. JSON Export (with metadata)\n4. Integration with:\n - Vector databases (semantic search)\n - LLMs (document Q&A, analysis)\n - Graph databases (relationship mapping)\n - ML pipelines (feature extraction)\n```\n\n### Common AI Use Cases\n\n- **Security Intelligence**: Analyze SCAP reports, detect vulnerabilities, compliance monitoring\n- **DevOps Automation**: Dependency analysis, configuration validation, build optimization \n- **Enterprise Search**: Semantic search across technical documentation and configurations\n- **Knowledge Extraction**: Extract structured data from XML for ML training datasets\n\n\n## \ud83d\ude80 Extending the Framework\n\n### Adding New Handlers\n\n```python\nfrom xml_analysis_framework.base import XMLHandler, SpecializedAnalysis, DocumentTypeInfo\n\nclass CustomHandler(XMLHandler):\n def can_handle_xml(self, root, namespaces):\n # Check if this handler can process the document\n if root.tag == 'custom-format':\n return True, 1.0 # (can_handle, confidence)\n return False, 0.0\n \n def detect_xml_type(self, root, namespaces):\n return DocumentTypeInfo(\n type_name=\"Custom Format\",\n confidence=1.0,\n version=\"1.0\"\n )\n \n def analyze_xml(self, root, file_path):\n return SpecializedAnalysis(\n type_name=\"Custom Format\",\n confidence=1.0,\n key_findings={\"custom_data\": \"value\"},\n ai_use_cases=[\"Custom AI application\"],\n structured_data={\"extracted\": \"data\"},\n file_path=file_path,\n handler_used=\"CustomHandler\"\n )\n \n def extract_xml_key_data(self, root):\n # Extract key data specific to your format\n return {\"key\": \"value\"}\n```\n\n### Custom Chunking Strategies\n\n```python\nfrom xml_analysis_framework.core.chunking import XMLChunkingStrategy, XMLChunk\nimport 
xml.etree.ElementTree as ET\n\nclass CustomChunking(XMLChunkingStrategy):\n def chunk_document(self, file_path, specialized_analysis=None):\n chunks = []\n tree = ET.parse(file_path)\n root = tree.getroot()\n \n # Custom chunking logic\n for i, element in enumerate(root):\n chunk = XMLChunk(\n chunk_id=f\"custom_{i}\",\n content=ET.tostring(element, encoding='unicode'),\n element_path=f\"/{element.tag}\",\n start_line=1,\n end_line=10,\n parent_context=None,\n metadata={\"custom\": True},\n token_estimate=100,\n elements_included=[element.tag]\n )\n chunks.append(chunk)\n \n return chunks\n\n# Use with the framework\nimport xml_analysis_framework as xaf\nfrom xml_analysis_framework import ChunkingOrchestrator\n\norchestrator = ChunkingOrchestrator()\n# The orchestrator will use your custom strategy when needed\n```\n\n\n## \ud83e\udd1d Contributing\n\nWe welcome contributions! Whether you're adding new XML handlers, improving chunking algorithms, or enhancing AI integrations, your contributions help make XML analysis more accessible and powerful.\n\n**Priority contribution areas:**\n\n- \ud83c\udfaf New XML format handlers (ERP, CRM, healthcare, government)\n- \u26a1 Enhanced chunking algorithms and strategies\n- \ud83d\ude80 Performance optimizations for large files\n- \ud83e\udd16 Advanced AI/ML integration examples\n- \ud83d\udcdd Documentation and usage examples\n\n**\ud83d\udc49 See [CONTRIBUTING.md](CONTRIBUTING.md) for complete guidelines, development setup, and submission process.**\n\n## \ud83d\udcc4 License\n\nThis project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.\n\n## \ud83d\ude4f Acknowledgments\n\n- Designed as part of the **AI Building Blocks** initiative\n- Built for the modern AI/ML ecosystem\n- Community-driven XML format support\n",
"bugtrack_url": null,
"license": "MIT",
"summary": "XML document analysis and preprocessing framework designed for AI/ML data pipelines - part of the unified analysis framework suite",
"version": "2.0.0",
"project_urls": {
"Documentation": "https://github.com/redhat-ai-americas/xml-analysis-framework/blob/main/README.md",
"Homepage": "https://github.com/redhat-ai-americas/xml-analysis-framework",
"Issues": "https://github.com/redhat-ai-americas/xml-analysis-framework/issues",
"Repository": "https://github.com/redhat-ai-americas/xml-analysis-framework"
},
"split_keywords": [
"xml",
" analysis",
" ai",
" ml",
" document-processing",
" semantic-search"
],
"urls": [
{
"comment_text": null,
"digests": {
"blake2b_256": "44da7c18257dc0e6d6ce640ddcfc379ed6b8d68aab3c4136e590d9203263cdc0",
"md5": "791f7cb64375c448dd88644c6d6ccb27",
"sha256": "c206150a7130160ff950fa8f11f0a40bf33f34b8bd7736211d5c7236bc7ef269"
},
"downloads": -1,
"filename": "xml_analysis_framework-2.0.0-py3-none-any.whl",
"has_sig": false,
"md5_digest": "791f7cb64375c448dd88644c6d6ccb27",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": ">=3.8",
"size": 207792,
"upload_time": "2025-10-28T01:49:55",
"upload_time_iso_8601": "2025-10-28T01:49:55.671555Z",
"url": "https://files.pythonhosted.org/packages/44/da/7c18257dc0e6d6ce640ddcfc379ed6b8d68aab3c4136e590d9203263cdc0/xml_analysis_framework-2.0.0-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": null,
"digests": {
"blake2b_256": "91f8d87bec1702a9e5aa6992567f4fa5f6630b6221070f93a25a031f9ee64ee3",
"md5": "7a83ecde573afeae818acd87db3ef64f",
"sha256": "dc7a6c6db5b9933db31246cb882566e8b73fa9287f032022028193baf90f3367"
},
"downloads": -1,
"filename": "xml_analysis_framework-2.0.0.tar.gz",
"has_sig": false,
"md5_digest": "7a83ecde573afeae818acd87db3ef64f",
"packagetype": "sdist",
"python_version": "source",
"requires_python": ">=3.8",
"size": 238324,
"upload_time": "2025-10-28T01:49:57",
"upload_time_iso_8601": "2025-10-28T01:49:57.172358Z",
"url": "https://files.pythonhosted.org/packages/91/f8/d87bec1702a9e5aa6992567f4fa5f6630b6221070f93a25a031f9ee64ee3/xml_analysis_framework-2.0.0.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2025-10-28 01:49:57",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "redhat-ai-americas",
"github_project": "xml-analysis-framework",
"travis_ci": false,
"coveralls": false,
"github_actions": true,
"requirements": [
{
"name": "defusedxml",
"specs": [
[
">=",
"0.7.1"
]
]
}
],
"lcname": "xml-analysis-framework"
}