agentChef

Name: agentChef
Version: 0.1.5
Home page: https://github.com/leoleojames1/agentChef
Summary: A tool for collecting, processing, and generating datasets for AI training
Upload time: 2024-09-25 11:42:03
Author: Leo Borcherding
Maintainer: None
Docs URL: None
Requires Python: >=3.6
License: None
Keywords: None
Requirements: No requirements were recorded.

# AgentChef Comprehensive Guide

This guide demonstrates how to use the agentChef package to scrape data from various sources, process it into a structured format, and run it through a data processing pipeline.

## Installation

First, install the agentChef package:

```bash
pip install agentChef
```

## Initialization

Import the necessary modules and initialize the DatasetKitchen:

```python
from agentChef import DatasetKitchen
from agentChef.cutlery import CustomAgentBase
import pandas as pd

config = {
    'templates_dir': './templates',
    'input_dir': './input',
    'output_dir': './output',
}

kitchen = DatasetKitchen(config)
```
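The three paths in `config` are plain local directories. If they do not exist yet, a minimal standard-library sketch like the one below can create them before the kitchen is used (it assumes the same `config` dict defined above; agentChef itself may or may not create these directories for you):

```python
from pathlib import Path

# Create the working directories referenced in `config` if they don't exist yet.
for key in ('templates_dir', 'input_dir', 'output_dir'):
    Path(config[key]).mkdir(parents=True, exist_ok=True)
```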

## Data Collection

### 1. Hugging Face Datasets

```python
hf_dataset_url = "https://huggingface.co/datasets/your_dataset"
hf_data = kitchen.document_loader.load_from_huggingface(hf_dataset_url)
```

### 2. Wikipedia

```python
wiki_query = "Artificial Intelligence"
wiki_data = kitchen.document_loader.load_from_wikipedia(wiki_query)
```

### 3. Reddit (Custom Agent)

Create a custom Reddit agent:

```python
import praw
from typing import List, Dict, Any

class RedditAgent(CustomAgentBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
                                  client_secret='YOUR_CLIENT_SECRET',
                                  user_agent='YOUR_USER_AGENT')

    def scrape_data(self, source: str) -> List[Dict[str, Any]]:
        submission = self.reddit.submission(url=source)
        submission.comments.replace_more(limit=None)
        return [{'body': comment.body, 'score': comment.score} for comment in submission.comments.list()]

    def process_data(self, data: List[Dict[str, Any]]) -> pd.DataFrame:
        return pd.DataFrame(data)

kitchen.register_custom_agent('reddit', RedditAgent)
reddit_agent = kitchen.get_agent('reddit')
reddit_data = reddit_agent.run('https://www.reddit.com/r/AskReddit/comments/example_post/')
```
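Assuming `CustomAgentBase` follows the contract shown above (override `scrape_data` and `process_data`, then let `run` chain them), the same pattern extends to other sources. The sketch below is a hypothetical agent that reads records from a local JSON Lines file; the class name, file path, and agent name are illustrative, not part of agentChef:

```python
import json
from typing import List, Dict, Any

class JsonlAgent(CustomAgentBase):
    """Hypothetical agent that treats a local .jsonl file as a data source."""

    def scrape_data(self, source: str) -> List[Dict[str, Any]]:
        # `source` is assumed to be a path to a JSON Lines file, one record per line.
        with open(source, 'r', encoding='utf-8') as f:
            return [json.loads(line) for line in f if line.strip()]

    def process_data(self, data: List[Dict[str, Any]]) -> pd.DataFrame:
        return pd.DataFrame(data)

kitchen.register_custom_agent('jsonl', JsonlAgent)
jsonl_data = kitchen.get_agent('jsonl').run('./input/local_records.jsonl')
```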

### 4. arXiv

```python
arxiv_query = "machine learning"
arxiv_data = kitchen.document_loader.load_from_arxiv(arxiv_query)
```

### 5. GitHub

```python
github_repo = "username/repo"
github_data = kitchen.document_loader.load_from_github(github_repo)
```

## Data Processing

Now that we have collected data from various sources, let's process it into the desired JSON structure:

```python
def process_data(data, source_type):
    """Map each scraped item into the instruction/input/output record format used for training."""
    processed_data = []
    for item in data:
        processed_item = {
            "task": "command_description",
            "instruction": "You are a function description specialist for Ollama Agent Roll Cage.",
            "input": f"Please explain what the command does in the context of {source_type}.",
            "output": item.get('body', item.get('content', str(item))),
            "command": f"/{source_type}_command"
        }
        processed_data.append(processed_item)
    return processed_data

# Process data from each source
hf_processed = process_data(hf_data, "huggingface")
wiki_processed = process_data(wiki_data, "wikipedia")
reddit_processed = process_data(reddit_data.to_dict('records'), "reddit")
arxiv_processed = process_data(arxiv_data, "arxiv")
github_processed = process_data(github_data, "github")

# Combine all processed data
all_processed_data = hf_processed + wiki_processed + reddit_processed + arxiv_processed + github_processed
```
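For reference, a single processed record ends up with the following shape (the `output` value here is an illustrative placeholder for a scraped Reddit comment):

```python
example_record = {
    "task": "command_description",
    "instruction": "You are a function description specialist for Ollama Agent Roll Cage.",
    "input": "Please explain what the command does in the context of reddit.",
    "output": "Example comment body scraped from the thread.",
    "command": "/reddit_command"
}
```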

## Data Pipeline

Now, let's run the combined data through the agentChef pipeline:

```python
# Convert to DataFrame
df = pd.DataFrame(all_processed_data)

# Clean data
cleaned_data = kitchen.clean_data(df)

# Augment data
augmentation_config = {
    'instruction': {'type': 'paraphrase'},
    'input': {'type': 'paraphrase'},
    'output': {'type': 'paraphrase'}
}
augmented_data = kitchen.augment_data(cleaned_data, augmentation_config)

# Generate synthetic data
num_samples = 100
synthetic_data = kitchen.generate_synthetic_data(augmented_data, num_samples, augmentation_config)

# Final cleaning
final_data = kitchen.clean_data(synthetic_data)
```
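Before writing anything to disk, it is worth a quick sanity check. Assuming the pipeline steps return pandas DataFrames (as the save step below implies), a short inspection might look like this:

```python
# Quick sanity check on the final DataFrame (assumes the pipeline returns pandas DataFrames).
print(final_data.shape)                       # number of rows and columns
print(final_data.columns.tolist())            # expect task/instruction/input/output/command
print(final_data.head(3).to_dict('records'))  # peek at a few records
```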

## Saving and Pushing to Hugging Face

Save the final data as JSON and Parquet:

```python
# Save as JSON
json_output = 'final_dataset.json'
final_data.to_json(json_output, orient='records', indent=2)
print(f"Saved JSON to {json_output}")

# Save as Parquet
parquet_output = 'final_dataset.parquet'
kitchen.file_handler.save_to_parquet(final_data, parquet_output)
print(f"Saved Parquet to {parquet_output}")
```
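As an optional round-trip check using plain pandas (Parquet support requires pyarrow or fastparquet), you can reload both files and confirm the row counts match:

```python
# Reload both outputs and confirm they contain the same number of records.
reloaded_json = pd.read_json(json_output, orient='records')
reloaded_parquet = pd.read_parquet(parquet_output)
assert len(reloaded_json) == len(reloaded_parquet), "row counts differ between JSON and Parquet"
```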

Optionally, push the Parquet file to Hugging Face:

```python
from huggingface_hub import HfApi

def push_to_huggingface(file_path, repo_id, token):
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_path,
        repo_id=repo_id,
        token=token
    )

# Push to Hugging Face
hf_token = "your_huggingface_token"
hf_repo = "your_username/your_dataset_repo"
push_to_huggingface(parquet_output, hf_repo, hf_token)
print(f"Pushed {parquet_output} to Hugging Face repository: {hf_repo}")
```
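Rather than hardcoding the token in the script, it is safer to read it from the environment. The sketch below is a small variation of the snippet above; `HF_TOKEN` is an arbitrary variable name chosen here, not something agentChef or huggingface_hub requires:

```python
import os

# Read the Hugging Face token from the environment instead of hardcoding it.
hf_token = os.environ.get("HF_TOKEN")
if not hf_token:
    raise RuntimeError("Set the HF_TOKEN environment variable before pushing.")
push_to_huggingface(parquet_output, "your_username/your_dataset_repo", hf_token)
```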

## Complete Example

Here's a complete script that puts all these steps together:

```python
from agentChef import DatasetKitchen
from agentChef.cutlery import CustomAgentBase
import pandas as pd
import praw
from typing import List, Dict, Any
from huggingface_hub import HfApi

# Configuration
config = {
    'templates_dir': './templates',
    'input_dir': './input',
    'output_dir': './output',
    'github_access_token': 'your_github_token_if_needed'
}

# Initialize DatasetKitchen
kitchen = DatasetKitchen(config)

# Define Reddit Agent
class RedditAgent(CustomAgentBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
                                  client_secret='YOUR_CLIENT_SECRET',
                                  user_agent='YOUR_USER_AGENT')

    def scrape_data(self, source: str) -> List[Dict[str, Any]]:
        submission = self.reddit.submission(url=source)
        submission.comments.replace_more(limit=None)
        return [{'body': comment.body, 'score': comment.score} for comment in submission.comments.list()]

    def process_data(self, data: List[Dict[str, Any]]) -> pd.DataFrame:
        return pd.DataFrame(data)

# Register Reddit Agent
kitchen.register_custom_agent('reddit', RedditAgent)

# Data Collection
hf_data = kitchen.document_loader.load_from_huggingface("https://huggingface.co/datasets/your_dataset")
wiki_data = kitchen.document_loader.load_from_wikipedia("Artificial Intelligence")
reddit_agent = kitchen.get_agent('reddit')
reddit_data = reddit_agent.run('https://www.reddit.com/r/AskReddit/comments/example_post/')
arxiv_data = kitchen.document_loader.load_from_arxiv("machine learning")
github_data = kitchen.document_loader.load_from_github("username/repo")

# Data Processing
def process_data(data, source_type):
    """Map each scraped item into the instruction/input/output record format used for training."""
    processed_data = []
    for item in data:
        processed_item = {
            "task": "command_description",
            "instruction": "You are a function description specialist for Ollama Agent Roll Cage.",
            "input": f"Please explain what the command does in the context of {source_type}.",
            "output": item.get('body', item.get('content', str(item))),
            "command": f"/{source_type}_command"
        }
        processed_data.append(processed_item)
    return processed_data

hf_processed = process_data(hf_data, "huggingface")
wiki_processed = process_data(wiki_data, "wikipedia")
reddit_processed = process_data(reddit_data.to_dict('records'), "reddit")
arxiv_processed = process_data(arxiv_data, "arxiv")
github_processed = process_data(github_data, "github")

all_processed_data = hf_processed + wiki_processed + reddit_processed + arxiv_processed + github_processed

# Data Pipeline
df = pd.DataFrame(all_processed_data)
cleaned_data = kitchen.clean_data(df)

augmentation_config = {
    'instruction': {'type': 'paraphrase'},
    'input': {'type': 'paraphrase'},
    'output': {'type': 'paraphrase'}
}
augmented_data = kitchen.augment_data(cleaned_data, augmentation_config)

num_samples = 100
synthetic_data = kitchen.generate_synthetic_data(augmented_data, num_samples, augmentation_config)

final_data = kitchen.clean_data(synthetic_data)

# Save and Push to Hugging Face
json_output = 'final_dataset.json'
final_data.to_json(json_output, orient='records', indent=2)
print(f"Saved JSON to {json_output}")

parquet_output = 'final_dataset.parquet'
kitchen.file_handler.save_to_parquet(final_data, parquet_output)
print(f"Saved Parquet to {parquet_output}")

def push_to_huggingface(file_path, repo_id, token):
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_path,
        repo_id=repo_id,
        token=token
    )

hf_token = "your_huggingface_token"
hf_repo = "your_username/your_dataset_repo"
push_to_huggingface(parquet_output, hf_repo, hf_token)
print(f"Pushed {parquet_output} to Hugging Face repository: {hf_repo}")
```

This comprehensive guide demonstrates how to use the agentChef package to collect data from various sources, process it into a structured format, run it through a data processing pipeline, and finally save and push the results to Hugging Face. You can adapt this guide to your specific needs and data sources.
