# Prompt flow evaluators
[![Python package](https://img.shields.io/pypi/v/promptflow-evals)](https://pypi.org/project/promptflow-evals/)
[![License: MIT](https://img.shields.io/github/license/microsoft/promptflow)](https://github.com/microsoft/promptflow/blob/main/LICENSE)
## Introduction
Evaluators are custom or prebuilt promptflow flows designed to measure the quality of outputs from language models.
## Usage
Users can create evaluator runs on their local machine, as shown in the example below:
```python
import os
from pprint import pprint
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator
from promptflow.evals.evaluators.content_safety import ViolenceEvaluator


def answer_length(answer, **kwargs):
    return {"value": len(answer)}


if __name__ == "__main__":
    # Built-in evaluators
    # Initialize Azure OpenAI Connection
    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
        api_key=os.environ.get("AZURE_OPENAI_KEY"),
        azure_deployment=os.environ.get("AZURE_OPENAI_DEPLOYMENT"),
    )

    # Initializing Relevance Evaluator
    relevance_eval = RelevanceEvaluator(model_config)

    # Running Relevance Evaluator on a single input row
    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the Alpine Explorer Tent is the most waterproof."
        " The Adventure Dining Table has a higher weight.",
        question="Which tent is the most waterproof?",
    )

    pprint(relevance_score)
    # {'gpt_relevance': 5.0}

    # Content Safety Evaluator

    # Initialize Project Scope
    project_scope = {
        "subscription_id": "e0fd569c-e34a-4249-8c24-e8d723c7f054",
        "resource_group_name": "rg-test",
        "project_name": "project-test",
    }

    violence_eval = ViolenceEvaluator(project_scope)
    violence_score = violence_eval(question="What is the capital of France?", answer="Paris.")
    pprint(violence_score)
    # {'violence': 'Very low',
    #  'violence_reason': "The system's response is a straightforward factual answer "
    #                     'to a geography question. There is no violent content or '
    #                     'language present.',
    #  'violence_score': 0}

    # Code-based evaluator
    answer_length("The Alpine Explorer Tent is the most waterproof.")
    # {'value': 48}

    # Using multiple evaluators together via the `evaluate` API
    result = evaluate(
        data="evaluate_test_data.jsonl",
        evaluators={
            "answer_length": answer_length,
            "violence": violence_eval,
        },
    )

    pprint(result)
```
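The `evaluate` call above reads its inputs from `evaluate_test_data.jsonl`, a local file with one JSON object per line whose field names line up with the evaluator inputs (`question` and `answer` here). The snippet below is a minimal sketch of how such a file could be created; the rows themselves are illustrative only.

```python
import json

# Illustrative rows only; the field names match the evaluator inputs used above.
sample_rows = [
    {
        "question": "Which tent is the most waterproof?",
        "answer": "The Alpine Explorer Tent is the most waterproof.",
    },
    {
        "question": "What is the capital of France?",
        "answer": "Paris.",
    },
]

with open("evaluate_test_data.jsonl", "w") as f:
    for row in sample_rows:
        f.write(json.dumps(row) + "\n")
```

Evaluator inputs are typically resolved from dataset columns of the same name, so each row supplies `answer` for `answer_length` and both `question` and `answer` for the violence evaluator. The returned `result` is a dictionary that generally contains per-row evaluator outputs alongside aggregated metrics.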