lastmile-eval

Name: lastmile-eval
Version: 0.0.59
Summary: An API to measure evaluation criteria (ex: faithfulness) of generative AI outputs
Author: LastMile AI
Requires Python: >=3.10
Upload time: 2024-06-12 21:54:07
Homepage: https://github.com/lastmile-ai/eval
Bug Tracker: https://github.com/lastmile-ai/eval/issues
Requirements: no requirements were recorded
# LastMile AI Eval

Library of tools to evaluate your RAG system.

## Setup

1. Get a LastMile API token (see section below)
2. Install this library: `pip install lastmile-eval`
3. Gather your data that needs evaluation
4. Usage: see examples below.

## LastMile API token

To get a LastMile AI token, go to the [LastMile token's webpage](https://lastmileai.dev/settings?page=tokens).
You can create an account with Google or GitHub, then click "Create new token" in the "API Tokens" section. Once a token is created, be sure to save it somewhere, since you won't be able to view its value on the website again (though you can create a new one if that happens).

**Please be careful not to share your token on GitHub.** Instead, we recommend saving it in your project's (or home directory's) `.env` file as `LASTMILE_API_TOKEN=<TOKEN_HERE>` and loading it with `dotenv.load_dotenv()`. See the `examples/` folder for how to do this.
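For example, once the token is saved in `.env`, it can be loaded with the same pattern the example scripts below use:

```python
import os

import dotenv

# Load LASTMILE_API_TOKEN (and any other secrets) from the nearest .env file.
dotenv.load_dotenv()
api_token = os.getenv("LASTMILE_API_TOKEN")
```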

## LLM Provider Tokens (`.env` file)

To use LLM-based evaluators, also add your other provider API tokens to your `.env` file.
Example: `OPENAI_API_KEY=<TOKEN_HERE>`
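
A project-level `.env` file for these examples would then contain both entries (placeholder values shown, not real tokens):

```
LASTMILE_API_TOKEN=<TOKEN_HERE>
OPENAI_API_KEY=<TOKEN_HERE>
```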

# Examples

## Example 1: RAG Evaluation Script

```python
"""The RAG evaluation API runs evaluation criteria (ex: faithfulness)
    of generative AI outputs from a RAG system.

Particularly, we evaluate based on this triplet of information:

1. User query
2. Data that goes into the LLM
3. LLM's output response

The  `get_rag_eval_scores()` function returns a faithfulness score from 0 to 1.
"""

import os
import sys
from textwrap import dedent

import dotenv
import pandas as pd

from lastmile_eval.rag import get_rag_eval_scores


def main():
    rag_scores_example_1()
    rag_scores_example_2()
    return 0


def get_lastmile_api_token():
    """See README.md for mor information."""
    dotenv.load_dotenv()
    api_token = os.getenv("LASTMILE_API_TOKEN")
    assert api_token is not None
    return api_token


def rag_scores_example_1():
    print("\n\nRAG scores example 1:")
    statement1 = "the sky is red"
    statement2 = "the sky is blue"

    queries = ["what color is the sky?", "is the sky blue?"]
    data = [statement1, statement1]
    responses = [statement1, statement2]
    api_token = get_lastmile_api_token()
    result = get_rag_eval_scores(
        queries,
        data,
        responses,
        api_token,
    )

    print("Result: ", result)

    # result will look something like:
    # {'p_faithful': [0.9955534338951111, 6.857347034383565e-05]}


def rag_scores_example_2():
    print("\n\nRAG scores example 2:")
    questions = ["what is the ultimate purpose of the endpoint?"] * 2

    data1 = """
    Server-side, we will need to expose a prompt_schemas endpoint
    which provides the mapping of model name → prompt schema
    which we will use for rendering prompt input/settings/metadata on the client
    """

    data = [data1] * 2

    responses = ["""client rendering""", """metadata mapping"""]

    # f"{data1}. Query: {questions[0]}",
    # f"{data1}. Query: {questions[1]}",

    print(f"Input batch:")
    df = pd.DataFrame(
        {"question": questions, "data": data, "response": responses}
    )
    print(df)

    api_token = get_lastmile_api_token()

    result_dict = get_rag_eval_scores(
        questions,
        data,
        responses,
        api_token,
    )

    df["p_faithful"] = result_dict["p_faithful"]

    print(
        dedent(
            """
            Given a question and reference data (assumed to be factual),
            the faithfulness score estimates whether
            the response correctly answers the question according to the given data.
            """
        )
    )
    print("Dataframe with scores:")
    print(df)


if __name__ == "__main__":
    sys.exit(main())
```
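
Because `p_faithful` is a probability between 0 and 1, a common follow-up is to flag low-faithfulness responses with a threshold. A minimal sketch, assuming scores in the same shape that `get_rag_eval_scores()` returns (the 0.5 cutoff and the sample values are arbitrary illustrations):

```python
import pandas as pd

# Hypothetical scores in the shape returned by get_rag_eval_scores().
result_dict = {"p_faithful": [0.9955, 0.0001]}

df = pd.DataFrame(
    {
        "response": ["the sky is red", "the sky is blue"],
        "p_faithful": result_dict["p_faithful"],
    }
)

# Flag responses whose faithfulness probability falls below the cutoff.
FAITHFULNESS_THRESHOLD = 0.5  # arbitrary example value
df["is_faithful"] = df["p_faithful"] >= FAITHFULNESS_THRESHOLD
print(df)
```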

## Example 2: General Text Evaluators Script

```python
"""The text module provides more general evaluation functions
for text generated by AI models."""

import sys

import dotenv

import lastmile_eval.text as lm_eval_text


def main():
    # OpenAI evaluators require an OpenAI API key in the .env file.
    # See README.md for more information about `.env`.
    dotenv.load_dotenv()

    SUPPORTED_BACKING_LLMS = [
        "gpt-3.5-turbo",
        "gpt-4",
    ]

    print("Starting text evaluation examples.")

    for model_name in SUPPORTED_BACKING_LLMS:
        print(
            f"\n\n\n\nRunning example evaluators with backing LLM {model_name}"
        )
        text_scores_example_1(model_name)
        text_scores_example_2(model_name)
        text_scores_example_3(model_name)

    return 0


def text_scores_example_1(model_name: str):
    texts_to_evaluate = [
        "The quick brown fox jumps over the lazy dog.",
        "The quick brown fox jumps over the lazy dog.",
    ]
    references = [
        "The quick brown fox jumps over the lazy dog.",
        "The swift brown fox leaps over the lazy dog.",
    ]
    bleu = lm_eval_text.calculate_bleu_score(texts_to_evaluate, references)
    print("\n\nTexts to evaluate: ", texts_to_evaluate)
    print("References: ", references)
    print("\nBLEU scores: ", bleu)

    rouge1 = lm_eval_text.calculate_rouge1_score(texts_to_evaluate, references)
    print("\nROUGE1 scores: ", rouge1)

    exact_match = lm_eval_text.calculate_exact_match_score(
        texts_to_evaluate, references
    )

    print("\nExact match scores: ", exact_match)

    relevance = lm_eval_text.calculate_relevance_score(
        texts_to_evaluate, references, model_name=model_name
    )

    print("\nRelevance scores: ", relevance)

    summarization = lm_eval_text.calculate_summarization_score(
        texts_to_evaluate, references, model_name=model_name
    )

    print("\nSummarization scores: ", summarization)

    custom_semantic_similarity = (
        lm_eval_text.calculate_custom_llm_metric_example_semantic_similarity(
            texts_to_evaluate, references, model_name=model_name
        )
    )

    print("\nCustom semantic similarity scores: ", custom_semantic_similarity)


def text_scores_example_2(model_name: str):
    texts_to_evaluate = [
        "The quick brown fox jumps over the lazy dog.",
        "The quick brown fox jumps over the lazy dog.",
    ]
    references = [
        "The quick brown fox jumps over the lazy dog.",
        "The swift brown fox leaps over the lazy dog.",
    ]

    questions = ["What does the animal do", "Describe the fox"]

    qa = lm_eval_text.calculate_qa_score(
        texts_to_evaluate, references, questions, model_name=model_name
    )
    print("\n\nTexts to evaluate: ", texts_to_evaluate)
    print("References: ", references)
    print("\nQA scores: ", qa)

    human_vs_ai = lm_eval_text.calculate_human_vs_ai_score(
        texts_to_evaluate, references, questions, model_name=model_name
    )

    print("\nHuman vs AI scores: ", human_vs_ai)


def text_scores_example_3(model_name: str):
    texts_to_evaluate = [
        "I am happy",
        "I am sad",
    ]

    toxicity = lm_eval_text.calculate_toxicity_score(
        texts_to_evaluate, model_name=model_name
    )
    print("\nToxicity scores: ", toxicity)

    custom_sentiment = (
        lm_eval_text.calculate_custom_llm_metric_example_sentiment(
            texts_to_evaluate, model_name=model_name
        )
    )

    print("\nCustom sentiment scores: ", custom_sentiment)


if __name__ == "__main__":
    sys.exit(main())

```
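
For quick experiments, any of the evaluators in the script can also be called on its own. A condensed sketch of the BLEU call from `text_scores_example_1`, using the same function and argument order as above:

```python
import lastmile_eval.text as lm_eval_text

texts_to_evaluate = ["The quick brown fox jumps over the lazy dog."]
references = ["The swift brown fox leaps over the lazy dog."]

# Each generated text is scored against its corresponding reference.
bleu = lm_eval_text.calculate_bleu_score(texts_to_evaluate, references)
print("BLEU scores:", bleu)
```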

            
