# AgentChef Comprehensive Guide
This guide demonstrates how to use the agentChef package to scrape data from various sources, process it into a structured format, and run it through a data processing pipeline.
## Installation
First, install the agentChef package:
```bash
pip install agentChef
```
## Initialization
Import the necessary modules and initialize the DatasetKitchen:
```python
from agentChef import DatasetKitchen
from agentChef.cutlery import CustomAgentBase
import pandas as pd
config = {
    'templates_dir': './templates',
    'input_dir': './input',
    'output_dir': './output',
}
kitchen = DatasetKitchen(config)
```
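DatasetKitchen reads from and writes to the configured directories. If the package does not create them for you (an assumption, not documented behavior), a short sketch like the following makes sure they exist before the pipeline runs:
```python
from pathlib import Path

# Create the configured directories up front, in case DatasetKitchen
# does not create them itself.
for key in ('templates_dir', 'input_dir', 'output_dir'):
    Path(config[key]).mkdir(parents=True, exist_ok=True)
```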
## Data Collection
### 1. Hugging Face Datasets
```python
hf_dataset_url = "https://huggingface.co/datasets/your_dataset"
hf_data = kitchen.document_loader.load_from_huggingface(hf_dataset_url)
```
### 2. Wikipedia
```python
wiki_query = "Artificial Intelligence"
wiki_data = kitchen.document_loader.load_from_wikipedia(wiki_query)
```
### 3. Reddit (Custom Agent)
Create a custom Reddit agent:
```python
from typing import Any, Dict, List

import praw

class RedditAgent(CustomAgentBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
                                  client_secret='YOUR_CLIENT_SECRET',
                                  user_agent='YOUR_USER_AGENT')

    def scrape_data(self, source: str) -> List[Dict[str, Any]]:
        submission = self.reddit.submission(url=source)
        submission.comments.replace_more(limit=None)
        return [{'body': comment.body, 'score': comment.score} for comment in submission.comments.list()]

    def process_data(self, data: List[Dict[str, Any]]) -> pd.DataFrame:
        return pd.DataFrame(data)
kitchen.register_custom_agent('reddit', RedditAgent)
reddit_agent = kitchen.get_agent('reddit')
reddit_data = reddit_agent.run('https://www.reddit.com/r/AskReddit/comments/example_post/')
```
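Rather than hard-coding the Reddit API credentials, you may prefer to read them from environment variables. The sketch below assumes environment variable names of your own choosing (REDDIT_CLIENT_ID and friends are illustrative placeholders, not something agentChef or praw requires):
```python
import os
import praw

# Build the praw client from environment variables instead of literals.
# The variable names here are placeholders; use whatever your setup defines.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "agentChef-reddit-agent"),
)
```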
### 4. arXiv
```python
arxiv_query = "machine learning"
arxiv_data = kitchen.document_loader.load_from_arxiv(arxiv_query)
```
### 5. GitHub
```python
github_repo = "username/repo"
github_data = kitchen.document_loader.load_from_github(github_repo)
```
## Data Processing
Now that we have collected data from various sources, let's process it into the desired JSON structure:
```python
def process_data(data, source_type):
    processed_data = []
    for item in data:
        processed_item = {
            "task": "command_description",
            "instruction": "You are a function description specialist for Ollama Agent Roll Cage.",
            "input": f"Please explain what the command does in the context of {source_type}.",
            "output": item.get('body', item.get('content', str(item))),
            "command": f"/{source_type}_command"
        }
        processed_data.append(processed_item)
    return processed_data
# Process data from each source
hf_processed = process_data(hf_data, "huggingface")
wiki_processed = process_data(wiki_data, "wikipedia")
reddit_processed = process_data(reddit_data.to_dict('records'), "reddit")
arxiv_processed = process_data(arxiv_data, "arxiv")
github_processed = process_data(github_data, "github")
# Combine all processed data
all_processed_data = hf_processed + wiki_processed + reddit_processed + arxiv_processed + github_processed
```
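Before running the pipeline, it can help to print one record and confirm it matches the intended schema. This assumes the loaders return list-like data, as the loop above does; the exact "output" text depends on what each loader actually returned:
```python
import json

# Inspect the first processed record to verify the five expected fields.
print(json.dumps(all_processed_data[0], indent=2))
```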
## Data Pipeline
Now, let's run the combined data through the agentChef pipeline:
```python
# Convert to DataFrame
df = pd.DataFrame(all_processed_data)
# Clean data
cleaned_data = kitchen.clean_data(df)
# Augment data
augmentation_config = {
    'instruction': {'type': 'paraphrase'},
    'input': {'type': 'paraphrase'},
    'output': {'type': 'paraphrase'}
}
augmented_data = kitchen.augment_data(cleaned_data, augmentation_config)
# Generate synthetic data
num_samples = 100
synthetic_data = kitchen.generate_synthetic_data(augmented_data, num_samples, augmentation_config)
# Final cleaning
final_data = kitchen.clean_data(synthetic_data)
```
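A quick sanity check on the pipeline output is worthwhile. The snippet below assumes clean_data and generate_synthetic_data return pandas DataFrames with the same columns as the input:
```python
# Basic shape and schema check on the final DataFrame.
print(final_data.shape)
print(final_data.columns.tolist())
print(final_data.head())
```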
## Saving and Pushing to Hugging Face
Save the final data as JSON and Parquet:
```python
# Save as JSON
json_output = 'final_dataset.json'
final_data.to_json(json_output, orient='records', indent=2)
print(f"Saved JSON to {json_output}")
# Save as Parquet
parquet_output = 'final_dataset.parquet'
kitchen.file_handler.save_to_parquet(final_data, parquet_output)
print(f"Saved Parquet to {parquet_output}")
```
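To verify that the Parquet file round-trips cleanly, you can reload it with pandas (this requires pyarrow or fastparquet to be installed):
```python
# Reload the Parquet file and compare row counts as a lightweight check.
reloaded = pd.read_parquet(parquet_output)
assert len(reloaded) == len(final_data)
print(reloaded.head())
```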
Optionally, push the Parquet file to Hugging Face:
```python
from huggingface_hub import HfApi
def push_to_huggingface(file_path, repo_id, token):
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_path,
        repo_id=repo_id,
        repo_type="dataset",  # the target repo here is a dataset repository
        token=token
    )
# Push to Hugging Face
hf_token = "your_huggingface_token"
hf_repo = "your_username/your_dataset_repo"
push_to_huggingface(parquet_output, hf_repo, hf_token)
print(f"Pushed {parquet_output} to Hugging Face repository: {hf_repo}")
```
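upload_file assumes the target repository already exists. If it might not, you can create it first with create_repo; passing exist_ok=True makes the call safe to repeat:
```python
from huggingface_hub import HfApi

# Create the dataset repository if it does not exist yet.
api = HfApi(token=hf_token)
api.create_repo(repo_id=hf_repo, repo_type="dataset", exist_ok=True)
```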
## Complete Example
Here's a complete script that puts all these steps together:
```python
from typing import Any, Dict, List

from agentChef import DatasetKitchen
from agentChef.cutlery import CustomAgentBase
import pandas as pd
import praw
from huggingface_hub import HfApi
# Configuration
config = {
    'templates_dir': './templates',
    'input_dir': './input',
    'output_dir': './output',
    'github_access_token': 'your_github_token_if_needed'
}
# Initialize DatasetKitchen
kitchen = DatasetKitchen(config)
# Define Reddit Agent
class RedditAgent(CustomAgentBase):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reddit = praw.Reddit(client_id='YOUR_CLIENT_ID',
                                  client_secret='YOUR_CLIENT_SECRET',
                                  user_agent='YOUR_USER_AGENT')

    def scrape_data(self, source: str) -> List[Dict[str, Any]]:
        submission = self.reddit.submission(url=source)
        submission.comments.replace_more(limit=None)
        return [{'body': comment.body, 'score': comment.score} for comment in submission.comments.list()]

    def process_data(self, data: List[Dict[str, Any]]) -> pd.DataFrame:
        return pd.DataFrame(data)
# Register Reddit Agent
kitchen.register_custom_agent('reddit', RedditAgent)
# Data Collection
hf_data = kitchen.document_loader.load_from_huggingface("https://huggingface.co/datasets/your_dataset")
wiki_data = kitchen.document_loader.load_from_wikipedia("Artificial Intelligence")
reddit_agent = kitchen.get_agent('reddit')
reddit_data = reddit_agent.run('https://www.reddit.com/r/AskReddit/comments/example_post/')
arxiv_data = kitchen.document_loader.load_from_arxiv("machine learning")
github_data = kitchen.document_loader.load_from_github("username/repo")
# Data Processing
def process_data(data, source_type):
    processed_data = []
    for item in data:
        processed_item = {
            "task": "command_description",
            "instruction": "You are a function description specialist for Ollama Agent Roll Cage.",
            "input": f"Please explain what the command does in the context of {source_type}.",
            "output": item.get('body', item.get('content', str(item))),
            "command": f"/{source_type}_command"
        }
        processed_data.append(processed_item)
    return processed_data
hf_processed = process_data(hf_data, "huggingface")
wiki_processed = process_data(wiki_data, "wikipedia")
reddit_processed = process_data(reddit_data.to_dict('records'), "reddit")
arxiv_processed = process_data(arxiv_data, "arxiv")
github_processed = process_data(github_data, "github")
all_processed_data = hf_processed + wiki_processed + reddit_processed + arxiv_processed + github_processed
# Data Pipeline
df = pd.DataFrame(all_processed_data)
cleaned_data = kitchen.clean_data(df)
augmentation_config = {
    'instruction': {'type': 'paraphrase'},
    'input': {'type': 'paraphrase'},
    'output': {'type': 'paraphrase'}
}
augmented_data = kitchen.augment_data(cleaned_data, augmentation_config)
num_samples = 100
synthetic_data = kitchen.generate_synthetic_data(augmented_data, num_samples, augmentation_config)
final_data = kitchen.clean_data(synthetic_data)
# Save and Push to Hugging Face
json_output = 'final_dataset.json'
final_data.to_json(json_output, orient='records', indent=2)
print(f"Saved JSON to {json_output}")
parquet_output = 'final_dataset.parquet'
kitchen.file_handler.save_to_parquet(final_data, parquet_output)
print(f"Saved Parquet to {parquet_output}")
def push_to_huggingface(file_path, repo_id, token):
    api = HfApi()
    api.upload_file(
        path_or_fileobj=file_path,
        path_in_repo=file_path,
        repo_id=repo_id,
        repo_type="dataset",  # the target repo here is a dataset repository
        token=token
    )
hf_token = "your_huggingface_token"
hf_repo = "your_username/your_dataset_repo"
push_to_huggingface(parquet_output, hf_repo, hf_token)
print(f"Pushed {parquet_output} to Hugging Face repository: {hf_repo}")
```
This comprehensive guide demonstrates how to use the agentChef package to collect data from various sources, process it into a structured format, run it through a data processing pipeline, and finally save and push the results to Hugging Face. You can adapt this guide to your specific needs and data sources.