Evaluator API

Evaluation framework and quality metrics

Overview

Evaluators quantify the output quality of an LLM application, supporting evaluation along multiple dimensions such as correctness, relevance, and coherence.

graph TD
    A[Evaluators] --> B[StringEvaluator]
    A --> C[PairwiseStringEvaluator]
    A --> D[EmbeddingEvaluator]

    B --> E[Accuracy]
    B --> F[Relevance]

    C --> G[Pairwise comparison]

    D --> H[Semantic similarity]

    A --> I[load_evaluator]
    I --> J[labeled_criteria]
    I --> K[criteria]
    I --> L[labeled_pairwise_string]

    style A fill:#e1f5fe
    style I fill:#c8e6c9

String Evaluators

StringEvaluator

Base class for string evaluators.

from abc import ABC, abstractmethod
from typing import Any, Optional

from langchain.evaluation import StringEvaluator

class StringEvaluator(ABC):
    """Base class for string evaluators.

    Subclasses implement the private _evaluate_strings hook; callers use
    the public evaluate_strings / aevaluate_strings wrappers, which
    validate the arguments before delegating.
    """

    @property
    def evaluation_name(self) -> str:
        """Return the evaluator's name."""

    @abstractmethod
    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """
        Evaluate a string output.

        Args:
            prediction: The model's predicted output.
            reference: A reference answer (optional).
            input: The input (optional).
            **kwargs: Additional parameters.

        Returns:
            An evaluation result dict, typically containing:
            - score: a score (0-1 or another range)
            - reasoning: the rationale for the score
            - other custom fields
        """

    def evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate arguments and delegate to _evaluate_strings."""

    async def aevaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Asynchronous evaluation."""

PairwiseStringEvaluator

Pairwise comparison evaluator.

from langchain.evaluation import PairwiseStringEvaluator

class PairwiseStringEvaluator(ABC):
    """Pairwise string evaluator.

    As with StringEvaluator, subclasses implement the private
    _evaluate_string_pairs hook; callers use evaluate_string_pairs.
    """

    @abstractmethod
    def _evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """
        Compare two outputs.

        Args:
            prediction: Output A.
            prediction_b: Output B.
            reference: A reference answer (optional).
            input: The input (optional).

        Returns:
            An evaluation result dict:
            - score: 1 (A is better), 0 (B is better), or 0.5 (tie)
            - reasoning: the rationale for the verdict
        """

    def evaluate_string_pairs(
        self,
        *,
        prediction: str,
        prediction_b: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        """Validate arguments and delegate to _evaluate_string_pairs."""

Loading Evaluators

load_evaluator

Load a built-in evaluator.

from langchain.evaluation import load_evaluator

def load_evaluator(
    evaluator: Union[EvaluatorType, str],
    *,
    llm: Optional[BaseLanguageModel] = None,
    **kwargs: Any,
) -> Union[StringEvaluator, PairwiseStringEvaluator]:
    """
    Load an evaluator.

    Args:
        evaluator: The evaluator identifier
            - "criteria": evaluation against custom criteria
            - "labeled_criteria": criteria evaluation with a reference label
            - "labeled_pairwise_string": pairwise comparison with a reference
            - "embedding_distance": embedding-distance evaluation
            - "string_distance": string-distance evaluation
        llm: The LLM used to perform the evaluation
        **kwargs: Additional parameters

    Returns:
        An evaluator instance
    """

Available Evaluators

| evaluator | Description |
| --- | --- |
| criteria | Custom criteria |
| labeled_criteria | Criteria with a reference label |
| labeled_pairwise_string | Pairwise comparison |
| embedding_distance | Embedding distance |
| string_distance | String distance |
| qa | QA evaluation |
| cot_qa | Chain-of-thought QA |
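
The string keys above are the values of the EvaluatorType enum, which load_evaluator also accepts directly. A quick sketch:

from langchain.evaluation import EvaluatorType, load_evaluator
from langchain_openai import ChatOpenAI

# Equivalent to load_evaluator("qa", llm=...)
evaluator = load_evaluator(EvaluatorType.QA, llm=ChatOpenAI(model="gpt-4o"))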

Usage Examples

from langchain.evaluation import load_evaluator
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Load a criteria evaluator
evaluator = load_evaluator(
    "labeled_criteria",
    criteria="correctness",
    llm=llm
)

result = evaluator.evaluate_strings(
    prediction="Paris is the capital of France",
    reference="Paris is the capital of France",
    input="What is the capital of France?"
)

print(f"Score: {result['score']}")
print(f"Reasoning: {result['reasoning']}")

Custom Criteria Evaluation

from langchain.evaluation import load_evaluator
from langchain_openai import ChatOpenAI

# Custom criteria
custom_criteria = {
    "helpfulness": "Is the output helpful to the user?",
    "clarity": "Is the output clear and easy to follow?",
    "accuracy": "Is the output accurate?",
    "relevance": "Is the output relevant to the input?"
}

evaluator = load_evaluator(
    "criteria",
    criteria=custom_criteria,
    llm=ChatOpenAI(model="gpt-4o")
)

result = evaluator.evaluate_strings(
    prediction="Python is a high-level programming language...",
    input="What is Python?"
)

Pairwise Comparison Evaluation

from langchain.evaluation import load_evaluator
from langchain_openai import ChatOpenAI

evaluator = load_evaluator(
    "labeled_pairwise_string",
    criteria="helpfulness",
    llm=ChatOpenAI(model="gpt-4o")
)

result = evaluator.evaluate_string_pairs(
    prediction="Python is a language",
    prediction_b="Python is a high-level programming language created by Guido van Rossum",
    input="What is Python?",
    reference="Python is a programming language"
)

print(f"Winner: {result['value']}")  # "A" or "B" (score: 1 = A, 0 = B)
print(f"Reasoning: {result['reasoning']}")

Embedding Distance Evaluation

from langchain.evaluation import load_evaluator
from langchain_openai import OpenAIEmbeddings

evaluator = load_evaluator(
    "embedding_distance",
    distance_metric="cosine",  # cosine, euclidean, manhattan
    embeddings=OpenAIEmbeddings()
)

result = evaluator.evaluate_strings(
    prediction="Similar content",
    reference="Similar content"
)

print(f"Distance score: {result['score']}")  # the score is a distance: lower means more similar

More Examples

# ========== Example 1: custom evaluator ==========
from typing import Any, Optional

from langchain.evaluation import StringEvaluator

class ExactMatchEvaluator(StringEvaluator):
    """Exact-match evaluator"""

    @property
    def evaluation_name(self) -> str:
        return "exact_match"

    @property
    def requires_reference(self) -> bool:
        # A reference answer is mandatory for exact matching
        return True

    def _evaluate_strings(
        self,
        *,
        prediction: str,
        reference: Optional[str] = None,
        input: Optional[str] = None,
        **kwargs: Any,
    ) -> dict:
        score = 1 if prediction.strip() == reference.strip() else 0
        return {
            "score": score,
            "reasoning": f"Prediction: {prediction} | Reference: {reference}"
        }

evaluator = ExactMatchEvaluator()
result = evaluator.evaluate_strings(
    prediction="Hello World",
    reference="Hello World"
)
# {"score": 1, "reasoning": "..."}

# ========== Example 2: batch evaluation ==========
test_cases = [
    {"input": "2+2=?", "reference": "4"},
    {"input": "Capital of France?", "reference": "Paris"},
    {"input": "Python?", "reference": "A programming language"}
]

for case in test_cases:
    prediction = llm.invoke(case["input"])
    result = evaluator.evaluate_strings(
        prediction=prediction.content,
        reference=case["reference"],
        input=case["input"]
    )
    print(f"Input: {case['input']}, score: {result['score']}")

# ========== Example 3: an evaluation chain ==========
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Build the evaluation chain
eval_prompt = ChatPromptTemplate.from_template("""
Evaluate the quality of the following answer:

Question: {input}
Answer: {prediction}

Give a score from 0 to 1: {grading_prompt}
""")

eval_chain = eval_prompt | llm | StrOutputParser()
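
# Invocation sketch: grading_prompt is a free-form instruction chosen by
# the caller (the values below are hypothetical).
grade = eval_chain.invoke({
    "input": "What is LangChain?",
    "prediction": "LangChain is a framework for building LLM applications",
    "grading_prompt": "Reply with a single number between 0 and 1."
})
print(grade)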

# ========== Example 4: RAG evaluation (RAGAS) ==========
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision
)

test_data = {
    "question": ["What is LangChain?"],
    "answer": ["LangChain is a framework"],
    "contexts": [["LangChain is..."]],  # retrieved contexts
    "ground_truth": ["LangChain is a framework for LLM applications"]
}

# ragas expects a datasets.Dataset, not a plain dict
result = evaluate(
    Dataset.from_dict(test_data),
    metrics=[faithfulness, answer_relevancy, context_recall, context_precision]
)

print(result.to_pandas())

# ========== Example 5: A/B test evaluation ==========
# "pairwise_string" compares two outputs without requiring a reference answer
evaluator = load_evaluator("pairwise_string", llm=llm)

# Outputs from two chain versions (chain_v1/chain_v2 assumed defined)
results_v1 = chain_v1.batch(inputs)
results_v2 = chain_v2.batch(inputs)

for i, (r1, r2) in enumerate(zip(results_v1, results_v2)):
    comparison = evaluator.evaluate_string_pairs(
        prediction=r1.content,
        prediction_b=r2.content,
        input=inputs[i]
    )
    print(f"Input {i}: {comparison['score']} (1 = A better, 0 = B better)")

Related APIs