文档转换器 API

DocumentTransformer 与文档处理

概述

文档转换器对加载的文档进行转换,如去重、压缩、元数据提取等操作。

graph TD
    A[DocumentTransformer] --> B[Html2TextTransformer]
    A --> C[BeautifulSoupTransformer]
    A --> D[DoctypeMetadataTransformer]
    A --> E[EmbeddingsRedundantFilter]

    A --> F[嵌入相关]
    F --> G[EmbeddingsClusteringFilter]
    F --> H[EmbeddingsMaxMarginalRelevanceExtraFilter]

    style A fill:#e1f5fe
    style E fill:#c8e6c9

基类

DocumentTransformer

文档转换器抽象基类。

from langchain_core.documents.transformers import DocumentTransformer

class DocumentTransformer(ABC):
    """Abstract base class for document transformers.

    NOTE(review): langchain_core actually exposes this class as
    ``BaseDocumentTransformer`` (module ``langchain_core.documents.transformers``)
    — confirm the intended name against the installed version.
    """

    @abstractmethod
    def transform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """
        Transform a list of documents.

        Args:
            documents: Sequence of documents to transform.
            **kwargs: Extra transformer-specific options.

        Returns:
            The transformed sequence of documents.
        """

    async def atransform_documents(
        self,
        documents: Sequence[Document],
        **kwargs: Any,
    ) -> Sequence[Document]:
        """
        Asynchronously transform a list of documents.

        Default implementation delegates to the synchronous
        transform_documents, run in a worker thread via asyncio.to_thread
        so the event loop is not blocked.
        """
        return await asyncio.to_thread(
            self.transform_documents, documents, **kwargs
        )

转换器类型

Html2TextTransformer

HTML 转纯文本。

from langchain_community.document_transformers import Html2TextTransformer

class Html2TextTransformer(DocumentTransformer):
    """Transformer that converts HTML document content to plain text."""

    def __init__(
        self,
        ignore_links: bool = False,
        ignore_images: bool = False,
        body_width: int = 0,
        ignore_emphasis: bool = False,
    ):
        """
        Initialize the transformer.

        Args:
            ignore_links: Whether to drop hyperlinks from the output.
            ignore_images: Whether to drop images from the output.
            body_width: Wrap output lines at this width (0 = no wrapping).
            ignore_emphasis: Whether to drop emphasis markers (bold/italic).
        """

使用示例

python
from langchain_community.document_transformers import Html2TextTransformer
from langchain_core.documents import Document

transformer = Html2TextTransformer()

html_content = """
<html>
  <body>
    <h1>标题</h1>
    <p>这是段落内容</p>
  </body>
</html>
"""

documents = [Document(page_content=html_content)]
transformed = transformer.transform_documents(documents)
print(transformed[0].page_content)
# "标题\n\n这是段落内容"

BeautifulSoupTransformer

BeautifulSoup HTML 转换器。

from langchain_community.document_transformers import BeautifulSoupTransformer

class BeautifulSoupTransformer(DocumentTransformer):
    """HTML transformer backed by BeautifulSoup.

    Extracts text from a configurable set of tags, optionally stripping
    unwanted tags and blank lines.
    """

    def __init__(
        self,
        # Fixed: the original used a mutable default argument (["p"]),
        # which is shared across all calls. A None sentinel with a
        # documented default is the safe, equivalent interface.
        tags_to_extract: Optional[List[str]] = None,
        strip_tags: Optional[List[str]] = None,
        remove_lines: bool = False,
    ):
        """
        Initialize the transformer.

        Args:
            tags_to_extract: Tags whose text should be extracted.
                Defaults to ["p"] when None is given.
            strip_tags: Tags to remove entirely from the HTML.
            remove_lines: Whether to remove blank lines from the output.
        """

使用示例

python
from langchain_community.document_transformers import BeautifulSoupTransformer

# 提取特定标签
transformer = BeautifulSoupTransformer(
    tags_to_extract=["h1", "h2", "p"]
)

html = """
<html>
  <body>
    <h1>主标题</h1>
    <p>段落1</p>
    <p>段落2</p>
  </body>
</html>
"""

docs = [Document(page_content=html)]
transformed = transformer.transform_documents(docs)

DoctypeMetadataTransformer

添加文档类型元数据。

from langchain_community.document_transformers import DoctypeMetadataTransformer

class DoctypeMetadataTransformer(DocumentTransformer):
    """Transformer that stamps a document-type tag into document metadata."""

    def __init__(
        self,
        doctype: str = "",
    ):
        """
        Initialize the transformer.

        Args:
            doctype: Document type identifier (e.g. "pdf", "html").
        """

使用示例

python
from langchain_community.document_transformers import DoctypeMetadataTransformer

transformer = DoctypeMetadataTransformer(doctype="pdf")

docs = [Document(page_content="内容")]
transformed = transformer.transform_documents(docs)

print(transformed[0].metadata)
# {"doctype": "pdf"}

EmbeddingsRedundantFilter

基于嵌入的去重过滤器。

from langchain_community.document_transformers import EmbeddingsRedundantFilter

class EmbeddingsRedundantFilter(DocumentTransformer):
    """Embedding-based de-duplication filter."""

    def __init__(
        self,
        embeddings: Embeddings,
        similarity_threshold: float = 0.95,
    ):
        """
        Initialize the filter.

        Args:
            embeddings: Embedding model used to vectorize documents.
            similarity_threshold: Pairwise similarity above which a
                document is considered a duplicate and dropped.
        """

使用示例

python
from langchain_community.document_transformers import EmbeddingsRedundantFilter
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

transformer = EmbeddingsRedundantFilter(
    embeddings=embeddings,
    similarity_threshold=0.9  # 90% 相似度视为重复
)

docs = [
    Document(page_content="Python 是一种编程语言"),
    Document(page_content="Python 是编程语言"),  # 重复
    Document(page_content="Java 也是一种编程语言")
]

unique_docs = transformer.transform_documents(docs)
print(f"原始: {len(docs)}, 去重后: {len(unique_docs)}")
# 3 -> 2

EmbeddingsClusteringFilter

基于嵌入聚类的选择器。

from langchain_community.document_transformers import EmbeddingsClusteringFilter

class EmbeddingsClusteringFilter(DocumentTransformer):
    """Embedding-clustering filter that keeps representative documents."""

    def __init__(
        self,
        embeddings: Embeddings,
        num_clusters: int = 5,
        num_closest: int = 1,
    ):
        """
        Initialize the filter.

        Args:
            embeddings: Embedding model used to vectorize documents.
            num_clusters: Number of clusters to form.
            num_closest: Number of documents to keep per cluster
                (those closest to the cluster center).
        """

使用示例

python
from langchain_community.document_transformers import EmbeddingsClusteringFilter
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

transformer = EmbeddingsClusteringFilter(
    embeddings=embeddings,
    num_clusters=3,      # 分成 3 组
    num_closest=2        # 每组选 2 个
)

# 从大量文档中选择代表性样本
docs = [Document(page_content=f"文档 {i}") for i in range(100)]
selected = transformer.transform_documents(docs)
print(f"选择: {len(selected)}")  # 6 (3 * 2)

LongContextReorder

长上下文重排序(优化检索结果)。

from langchain_community.document_transformers import LongContextReorder

class LongContextReorder(DocumentTransformer):
    """Long-context reordering.

    Places the most relevant documents at the beginning and end of the
    list and the less relevant ones in the middle, to match the
    "lost in the middle" attention pattern of LLMs.
    """

使用示例

python
from langchain_community.document_transformers import LongContextReorder

reorder = LongContextReorder()

docs = [
    Document(page_content="相关文档 1", metadata={"score": 0.95}),
    Document(page_content="相关文档 2", metadata={"score": 0.90}),
    Document(page_content="不太相关", metadata={"score": 0.50}),
    Document(page_content="最相关", metadata={"score": 0.99})
]

# 重排序后:最相关的文档位于首尾,不太相关的位于中间
reordered = reorder.transform_documents(docs)

NMFITextSplitter

非负矩阵分解文本分割器。

from langchain_experimental.text_splitter import NMFITextSplitter

class NMFITextSplitter(DocumentTransformer):
    """Text splitter used for document chunking.

    NOTE(review): naming is inconsistent — the class name says "NMFI",
    the section heading says NMF (non-negative matrix factorization),
    and the original docstring said "NMI". Confirm the intended
    algorithm and class name against langchain_experimental.
    """

使用示例

python
# ========== 示例1: 链式转换 ==========
from langchain_community.document_transformers import (
    Html2TextTransformer,
    DoctypeMetadataTransformer
)

# 转换管道
transformers = [
    Html2TextTransformer(),
    DoctypeMetadataTransformer(doctype="html")
]

docs = [Document(page_content="...")]

for transformer in transformers:
    docs = transformer.transform_documents(docs)

# ========== 示例2: 与加载器集成 ==========
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_transformers import Html2TextTransformer

# 加载并转换
loader = WebBaseLoader("https://example.com")
docs = loader.load()

transformer = Html2TextTransformer()
clean_docs = transformer.transform_documents(docs)

# ========== 示例3: 去重管道 ==========
from langchain_community.document_transformers import (
    Html2TextTransformer,
    EmbeddingsRedundantFilter
)
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()

pipeline = [
    Html2TextTransformer(),
    EmbeddingsRedundantFilter(embeddings, similarity_threshold=0.9)
]

# ========== 示例4: 自定义转换器 ==========
from langchain_core.documents.transformers import DocumentTransformer

class UppercaseTransformer(DocumentTransformer):
    """Example custom transformer that upper-cases document content."""

    def transform_documents(self, documents):
        """Return new documents whose page_content is upper-cased.

        Fixed: the original mutated the caller's Document objects in
        place; building fresh Documents leaves the input unchanged while
        still returning the transformed sequence callers expect.
        """
        return [
            Document(
                page_content=doc.page_content.upper(),
                metadata=dict(doc.metadata),
            )
            for doc in documents
        ]

transformer = UppercaseTransformer()
docs = transformer.transform_documents([
    Document(page_content="hello world")
])
# "HELLO WORLD"

相关 API