# Mixedbread AI Haystack 2.0 Integration
[![PyPI version](https://badge.fury.io/py/mixedbread-ai-haystack.svg)](https://badge.fury.io/py/mixedbread-ai-haystack)
[![Python versions](https://img.shields.io/pypi/pyversions/mixedbread-ai-haystack.svg)](https://pypi.org/project/mixedbread-ai-haystack/)
### **Table of Contents**
- [Overview](#overview)
- [Installation](#installation)
- [Usage](#usage)
## Overview
[mixedbread ai](https://www.mixedbread.ai) is an AI start-up that provides both open-source and in-house embedding and reranking models. You can choose from various foundation models to find the one best suited for your use case. More information can be found on the [documentation page](https://www.mixedbread.ai/api-reference/integrations#haystack).
## Installation
Install the Mixedbread AI integration with a simple pip command:
```bash
pip install mixedbread-ai-haystack
```
## Usage
This integration comes with 3 components:
- [`MixedbreadAITextEmbedder`](https://github.com/mixedbread-ai/mixedbread-ai-haystack/blob/main/mixedbread_ai_haystack/embedders/text_embedder.py)
- [`MixedbreadAIDocumentEmbedder`](https://github.com/mixedbread-ai/mixedbread-ai-haystack/blob/main/mixedbread_ai_haystack/embedders/document_embedder.py)
- [`MixedbreadAIReranker`](https://github.com/mixedbread-ai/mixedbread-ai-haystack/blob/main/mixedbread_ai_haystack/rerankers/reranker.py)
To embed documents, use `MixedbreadAIDocumentEmbedder`; to embed queries, use `MixedbreadAITextEmbedder`. To reorder retrieved documents by their relevance to a query, use `MixedbreadAIReranker`. Once you've selected the component for your specific use case, initialize it with the `model` and the [`api_key`](https://www.mixedbread.ai/dashboard?next=api-keys). You can also set the environment variable `MXBAI_API_KEY` instead of passing the API key as an argument.
### Embedders In a Pipeline
```python
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from mixedbread_ai_haystack.embedders import MixedbreadAIDocumentEmbedder, MixedbreadAITextEmbedder
# Set up the Document Store and Documents
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
documents = [
Document(content="china is the most populous country in the world."),
Document(content="india is the second most populous country in the world."),
Document(content="united states is the third most populous country in the world.")
]
# Indexing Pipeline
indexing_pipeline = Pipeline()
indexing_pipeline.add_component("doc_embedder", MixedbreadAIDocumentEmbedder(model="mixedbread-ai/mxbai-embed-large-v1"))
indexing_pipeline.add_component("writer", DocumentWriter(document_store=document_store))
indexing_pipeline.connect("doc_embedder", "writer")
indexing_pipeline.run({"doc_embedder": {"documents": documents}})
# Query Pipeline
text_embedder = MixedbreadAITextEmbedder(model="mixedbread-ai/mxbai-embed-large-v1")
query_pipeline = Pipeline()
query_pipeline.add_component("text_embedder", text_embedder)
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store))
query_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
results = query_pipeline.run({"text_embedder": {"text": "Which country has the biggest population?"}})
top_document = results["retriever"]["documents"][0].content
print(top_document)
```
### Reranker In a Pipeline
```python
from haystack import Document, Pipeline
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from mixedbread_ai_haystack.rerankers import MixedbreadAIReranker
# Set up the Document Store and Documents
documents = [
Document(content="china is the most populous country in the world."),
Document(content="india is the second most populous country in the world."),
Document(content="united states is the third most populous country in the world.")
]
document_store = InMemoryDocumentStore()
document_store.write_documents(documents)
# Define the Retriever and Reranker
retriever = InMemoryBM25Retriever(document_store=document_store)
reranker = MixedbreadAIReranker(model="mixedbread-ai/mxbai-rerank-large-v1", top_k=3)
# Rerank Pipeline
rerank_pipeline = Pipeline()
rerank_pipeline.add_component("retriever", retriever)
rerank_pipeline.add_component("reranker", reranker)
rerank_pipeline.connect("retriever.documents", "reranker.documents")
# Query and Rerank
query = "Which country has the second largest population"
results = rerank_pipeline.run({"retriever": {"query": query}, "reranker": {"query": query, "top_k": 3}})
print(results)
```
### Full Example With Metadata
```python
import os
from datasets import load_dataset
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.writers import DocumentWriter
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from mixedbread_ai_haystack import MixedbreadAIDocumentEmbedder, MixedbreadAITextEmbedder, MixedbreadAIReranker
# Set API Key
os.environ["MXBAI_API_KEY"] = "YOUR_API_KEY"
# Load the Dataset and Prepare Documents
ds = load_dataset("rajuptvs/ecommerce_products_clip")
documents = [
Document(
id=str(i),
content=data["Description"], meta={
"name": data["Product_name"],
"price": data["Price"],
"colors": data["colors"],
"pattern": data["Pattern"],
"extra": data["Other Details"]
}) for i, data in enumerate(ds["train"])
]
meta_fields = documents[0].meta.keys()
# Define the Components
document_store = InMemoryDocumentStore(embedding_similarity_function="cosine")
document_writer = DocumentWriter(document_store=document_store)
embedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=20)
embed_model = "mixedbread-ai/mxbai-embed-large-v1"
reranking_model = "mixedbread-ai/mxbai-rerank-large-v1"
text_embedder = MixedbreadAITextEmbedder(model=embed_model)
document_embedder = MixedbreadAIDocumentEmbedder(model=embed_model, max_concurrency=3, meta_fields_to_embed=meta_fields, show_progress_bar=True)
reranker = MixedbreadAIReranker(model=reranking_model, meta_fields_to_rank=meta_fields, top_k=5)
# Indexing Pipeline
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=document_embedder, name="document_embedder")
indexing_pipeline.add_component(instance=document_writer, name="document_writer")
indexing_pipeline.connect("document_embedder", "document_writer")
# Query Pipeline
query_pipeline = Pipeline()
query_pipeline.add_component(instance=text_embedder, name="text_embedder")
query_pipeline.add_component(instance=embedding_retriever, name="embedding_retriever")
query_pipeline.add_component(instance=reranker, name="reranker")
query_pipeline.connect("text_embedder", "embedding_retriever")
query_pipeline.connect("embedding_retriever.documents", "reranker.documents")
# Index the dataset
indexing_pipeline.run({"document_embedder": {"documents": documents}})
# Query to get results
query = "I am looking for a regular fit t-shirt in blue color. Ideally without any prints. What are my options?"
results = query_pipeline.run(
{
"text_embedder": {"text": query},
"reranker": {"query": query}
}
)
print(results["reranker"]["documents"])
```
Raw data
{
"_id": null,
"home_page": "https://github.com/mixedbread-ai/mixedbread-ai-haystack.git",
"name": "mixedbread-ai-haystack",
"maintainer": null,
"docs_url": null,
"requires_python": "<4.0,>=3.8",
"maintainer_email": null,
"keywords": "Embeddings, Rerank, Search, NLP, mixedbread.ai, mixedbread ai, Haystack, deepset",
"author": "Mixedbread AI",
"author_email": "support@mixedbread.ai",
"download_url": "https://files.pythonhosted.org/packages/fd/08/10b26f365309b0362fc2f39d858a81528a39a92fc5663c6d0f28bdefd160/mixedbread_ai_haystack-2.0.2.tar.gz",
"platform": null,
"description": "# Mixedbread AI Haystack 2.0 Integration\n[![PyPI version](https://badge.fury.io/py/mixedbread-ai-haystack.svg)](https://badge.fury.io/py/mixedbread-ai-haystack)\n[![Python versions](https://img.shields.io/pypi/pyversions/mixedbread-ai-haystack.svg)](https://pypi.org/project/mixedbread-ai-haystack/) \n\n### **Table of Contents**\n\n- [Overview](#overview)\n- [Installation](#installation)\n- [Usage](#usage)\n\n## Overview\n\n[mixedbread ai](https://www.mixedbread.ai) is an AI start-up that provides open-source, as well as, in-house embedding and reranking models. You can choose from various foundation models to find the one best suited for your use case. More information can be found on the [documentation page](https://www.mixedbread.ai/api-reference/integrations#haystack).\n\n## Installation\n\nInstall the Mixedbread AI integration with a simple pip command:\n\n```bash\npip install mixedbread-ai-haystack\n```\n\n## Usage\n\nThis integration comes with 3 components:\n- [`MixedbreadAITextEmbedder`](https://github.com/mixedbread-ai/mixedbread-ai-haystack/blob/main/mixedbread_ai_haystack/embedders/text_embedder.py)\n- [`MixedbreadAIDocumentEmbedder`](https://github.com/mixedbread-ai/mixedbread-ai-haystack/blob/main/mixedbread_ai_haystack/embedders/document_embedder.py).\n- [`MixedbreadAIReranker`](https://github.com/mixedbread-ai/mixedbread-ai-haystack/blob/main/mixedbread_ai_haystack/rerankers/reranker.py)\n\nFor documents you can use `MixedbreadAIDocumentEmbedder` and for queries you can use `MixedbreadAITextEmbedder`. Once you've selected the component for your specific use case, initialize the component with the `model` and the [`api_key`](https://www.mixedbread.ai/dashboard?next=api-keys). 
You can also set the environment variable `MXBAI_API_KEY` instead of passing the api key as an argument.\n\n### Embedders In a Pipeline\n\n```python\nfrom haystack import Document, Pipeline\nfrom haystack.document_stores.in_memory import InMemoryDocumentStore\nfrom haystack.components.writers import DocumentWriter\nfrom haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\nfrom mixedbread_ai_haystack.embedders import MixedbreadAIDocumentEmbedder, MixedbreadAITextEmbedder\n\n# Set-up the Document Store and Documents\ndocument_store = InMemoryDocumentStore(embedding_similarity_function=\"cosine\")\ndocuments = [\n Document(content=\"china is the most populous country in the world.\"), \n Document(content=\"india is the second most populous country in the world.\"), \n Document(content=\"united states is the third most populous country in the world.\")\n]\n\n# Indexing Pipeline\nindexing_pipeline = Pipeline()\nindexing_pipeline.add_component(\"doc_embedder\", MixedbreadAIDocumentEmbedder(model=\"mixedbread-ai/mxbai-embed-large-v1\"))\nindexing_pipeline.add_component(\"writer\", DocumentWriter(document_store=document_store))\nindexing_pipeline.connect(\"doc_embedder\", \"writer\")\n\nindexing_pipeline.run({\"doc_embedder\": {\"documents\": documents}})\n\n# Query Pipeline\ntext_embedder = MixedbreadAITextEmbedder(model=\"mixedbread-ai/mxbai-embed-large-v1\")\nquery_pipeline = Pipeline()\nquery_pipeline.add_component(\"text_embedder\", text_embedder)\nquery_pipeline.add_component(\"retriever\", InMemoryEmbeddingRetriever(document_store=document_store))\nquery_pipeline.connect(\"text_embedder.embedding\", \"retriever.query_embedding\")\n\nresults = query_pipeline.run({\"text_embedder\": {\"text\": \"Which country has the biggest population?\"}})\ntop_document = results[\"retriever\"][\"documents\"][0].content\nprint(top_document)\n```\n\n### Reranker In a Pipeline\n```python\nfrom haystack import Document, Pipeline\nfrom 
haystack.document_stores.in_memory import InMemoryDocumentStore\nfrom haystack.components.retrievers.in_memory import InMemoryBM25Retriever\nfrom mixedbread_ai_haystack.rerankers import MixedbreadAIReranker\n\n# Set-up the Document Store and Documents\ndocuments = [\n Document(content=\"china is the most populous country in the world.\"),\n Document(content=\"india is the second most populous country in the world.\"),\n Document(content=\"united states is the third most populous country in the world.\")\n]\ndocument_store = InMemoryDocumentStore()\ndocument_store.write_documents(documents)\n\n# Define the Retriever and Reranker\nretriever = InMemoryBM25Retriever(document_store=document_store)\nreranker = MixedbreadAIReranker(model=\"mixedbread-ai/mxbai-rerank-large-v1\", top_k=3)\n\n# Rerank Pipeline\nrerank_pipeline = Pipeline()\nrerank_pipeline.add_component(\"retriever\", retriever)\nrerank_pipeline.add_component(\"reranker\", reranker)\nrerank_pipeline.connect(\"retriever.documents\", \"reranker.documents\")\n\n# Query and Rerank\nquery = \"Which country has the second largest population\"\nresults = rerank_pipeline.run({\"retriever\": {\"query\": query}, \"reranker\": {\"query\": query, \"top_k\": 3}})\nprint(results)\n```\n\n### Full Example With Metadata\n```python\nimport os\nfrom datasets import load_dataset\nfrom haystack import Pipeline, Document\nfrom haystack.document_stores.in_memory import InMemoryDocumentStore\nfrom haystack.components.writers import DocumentWriter\nfrom haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever\nfrom mixedbread_ai_haystack import MixedbreadAIDocumentEmbedder, MixedbreadAITextEmbedder, MixedbreadAIReranker\n\n# Set API Key\nos.environ[\"MXBAI_API_KEY\"] = \"YOUR_API_KEY\"\n\n# Load the Dataset and Prepare Documents\nds = load_dataset(\"rajuptvs/ecommerce_products_clip\")\ndocuments = [\n Document(\n id=str(i),\n content=data[\"Description\"], meta={\n \"name\": data[\"Product_name\"],\n \"price\": 
data[\"Price\"],\n \"colors\": data[\"colors\"],\n \"pattern\": data[\"Pattern\"],\n \"extra\": data[\"Other Details\"]\n }) for i, data in enumerate(ds[\"train\"])\n]\nmeta_fields = documents[0].meta.keys()\n\n# Define the Components\ndocument_store = InMemoryDocumentStore(embedding_similarity_function=\"cosine\")\ndocument_writer = DocumentWriter(document_store=document_store)\nembedding_retriever = InMemoryEmbeddingRetriever(document_store=document_store, top_k=20)\n\nembed_model = \"mixedbread-ai/mxbai-embed-large-v1\"\nreranking_model = \"mixedbread-ai/mxbai-rerank-large-v1\" \n\ntext_embedder = MixedbreadAITextEmbedder(model=embed_model)\ndocument_embedder = MixedbreadAIDocumentEmbedder(model=embed_model, max_concurrency=3, meta_fields_to_embed=meta_fields, show_progress_bar=True)\nreranker = MixedbreadAIReranker(model=reranking_model, meta_fields_to_rank=meta_fields, top_k=5)\n\n# Indexing Pipeline\nindexing_pipeline = Pipeline()\nindexing_pipeline.add_component(instance=document_embedder, name=\"document_embedder\")\nindexing_pipeline.add_component(instance=document_writer, name=\"document_writer\")\nindexing_pipeline.connect(\"document_embedder\", \"document_writer\")\n\n# Query Pipeline\nquery_pipeline = Pipeline()\nquery_pipeline.add_component(instance=text_embedder, name=\"text_embedder\")\nquery_pipeline.add_component(instance=embedding_retriever, name=\"embedding_retriever\")\nquery_pipeline.add_component(instance=reranker, name=\"reranker\")\nquery_pipeline.connect(\"text_embedder\", \"embedding_retriever\")\nquery_pipeline.connect(\"embedding_retriever.documents\", \"reranker.documents\")\n\n# Index the dataset\nindexing_pipeline.run({\"document_embedder\": {\"documents\": documents}})\n\n# Query to get results\nquery = \"I am looking for a regular fit t-shirt in blue color. Ideally without any prints. 
What are my options?\"\nresults = query_pipeline.run(\n {\n \"text_embedder\": {\"text\": query},\n \"reranker\": {\"query\": query}\n }\n)\nprint(results[\"reranker\"][\"documents\"])\n```\n",
"bugtrack_url": null,
"license": "Apache-2.0",
"summary": "Mixedbread AI (https://www.mixedbread.ai) Haystack integration",
"version": "2.0.2",
"project_urls": {
"Homepage": "https://github.com/mixedbread-ai/mixedbread-ai-haystack.git",
"Repository": "https://github.com/mixedbread-ai/mixedbread-ai-haystack.git"
},
"split_keywords": [
"embeddings",
" rerank",
" search",
" nlp",
" mixedbread.ai",
" mixedbread ai",
" haystack",
" deepset"
],
"urls": [
{
"comment_text": "",
"digests": {
"blake2b_256": "f6dc7a19045a4843af05a9b22b3818fec1b7ffa873c843960dcaf12ef7301222",
"md5": "8398bdc65a30fd740a85d5c88dc40f3f",
"sha256": "b964a9b5b4e01c8eef6b16f38e5a04afe83195b4eb2b527e443a6627a0888646"
},
"downloads": -1,
"filename": "mixedbread_ai_haystack-2.0.2-py3-none-any.whl",
"has_sig": false,
"md5_digest": "8398bdc65a30fd740a85d5c88dc40f3f",
"packagetype": "bdist_wheel",
"python_version": "py3",
"requires_python": "<4.0,>=3.8",
"size": 11639,
"upload_time": "2024-07-02T12:09:35",
"upload_time_iso_8601": "2024-07-02T12:09:35.733894Z",
"url": "https://files.pythonhosted.org/packages/f6/dc/7a19045a4843af05a9b22b3818fec1b7ffa873c843960dcaf12ef7301222/mixedbread_ai_haystack-2.0.2-py3-none-any.whl",
"yanked": false,
"yanked_reason": null
},
{
"comment_text": "",
"digests": {
"blake2b_256": "fd0810b26f365309b0362fc2f39d858a81528a39a92fc5663c6d0f28bdefd160",
"md5": "0368d37f91e03e79f640e911b4fba151",
"sha256": "3fee7be8c8468a952766a318d39c94f21760ac2436214e229041a474e7d45ce1"
},
"downloads": -1,
"filename": "mixedbread_ai_haystack-2.0.2.tar.gz",
"has_sig": false,
"md5_digest": "0368d37f91e03e79f640e911b4fba151",
"packagetype": "sdist",
"python_version": "source",
"requires_python": "<4.0,>=3.8",
"size": 9731,
"upload_time": "2024-07-02T12:09:37",
"upload_time_iso_8601": "2024-07-02T12:09:37.169918Z",
"url": "https://files.pythonhosted.org/packages/fd/08/10b26f365309b0362fc2f39d858a81528a39a92fc5663c6d0f28bdefd160/mixedbread_ai_haystack-2.0.2.tar.gz",
"yanked": false,
"yanked_reason": null
}
],
"upload_time": "2024-07-02 12:09:37",
"github": true,
"gitlab": false,
"bitbucket": false,
"codeberg": false,
"github_user": "mixedbread-ai",
"github_project": "mixedbread-ai-haystack",
"travis_ci": false,
"coveralls": false,
"github_actions": true,
"lcname": "mixedbread-ai-haystack"
}