# Text Splitter API

TextSplitter and text splitting

## Overview

Text splitters break long text into smaller chunks for embedding and retrieval. LangChain ships several splitting strategies.

```mermaid
graph TD
    A[TextSplitter] --> B[CharacterTextSplitter]
    A --> C[RecursiveCharacterTextSplitter]
    A --> D[SemanticChunker]

    A --> E[Code splitting]
    E --> F[PythonCodeTextSplitter]
    E --> G["from_language(Language.JS)"]

    A --> H[Special formats]
    H --> I[MarkdownTextSplitter]
    H --> J[LatexTextSplitter]

    style A fill:#e1f5fe
    style C fill:#c8e6c9
```

## Base Class

### TextSplitter

The abstract base class shared by all text splitters.

```python
from langchain_text_splitters import TextSplitter

class TextSplitter(ABC):
    """Base class for text splitters."""

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """
        Split text.

        Args:
            text: The text to split.

        Returns:
            The list of text chunks.
        """

    def split_documents(self, documents: List[Document]) -> List[Document]:
        """
        Split a list of documents.

        Args:
            documents: A list of Documents.

        Returns:
            The list of split Documents.
        """

    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        """Join chunks with a separator (internal helper)."""

    def create_documents(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
    ) -> List[Document]:
        """Create Documents from texts."""

    def transform_documents(self, documents: Sequence[Document]) -> Sequence[Document]:
        """Transform a sequence of documents by splitting them."""
```

## Common Splitters

### RecursiveCharacterTextSplitter

The recursive character splitter (the recommended default).

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

class RecursiveCharacterTextSplitter(TextSplitter):
    """Recursive character splitter."""

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        strip_whitespace: bool = True,
    ):
        """
        Initialize.

        Args:
            separators: Separators, in priority order.
                Default: ["\n\n", "\n", " ", ""]
            keep_separator: Whether to keep separators in the chunks.
            chunk_size: Chunk size (in characters).
            chunk_overlap: Number of characters shared between adjacent chunks.
            length_function: Function used to measure length.
            strip_whitespace: Whether to strip surrounding whitespace.
        """

    def split_text(self, text: str) -> List[str]:
        """
        Split text recursively.

        Tries the highest-priority separator first; any chunk that is
        still too large is re-split with the next separator in the list.
        """
```

#### Usage

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Basic usage
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = splitter.split_text(large_text)

# Custom separators
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n\n", "\n\n", "\n", "。", "!", "?", " ", ""],
    chunk_size=500,
    chunk_overlap=50
)

# Splitting documents
from langchain_core.documents import Document

documents = splitter.split_documents([
    Document(page_content=large_text)
])

# Token-based length
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def token_count(text: str) -> int:
    return len(encoding.encode(text))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # 1000 tokens
    chunk_overlap=200,
    length_function=token_count
)
```
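
For token-based sizing, the base class also provides the `from_tiktoken_encoder` classmethod, which wires up the tokenizer-backed length function for you:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Equivalent token-based configuration via the built-in helper
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=1000,
    chunk_overlap=200,
)
```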

### CharacterTextSplitter

A simple single-separator splitter.

```python
from langchain_text_splitters import CharacterTextSplitter

class CharacterTextSplitter(TextSplitter):
    """Character splitter."""

    def __init__(
        self,
        separator: str = "\n\n",
        keep_separator: bool = False,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        strip_whitespace: bool = True,
    ):
        """
        Initialize.

        Args:
            separator: The separator (only this one is used).
            keep_separator: Whether to keep the separator.
            chunk_size: Chunk size.
            chunk_overlap: Overlap size.
            length_function: Length function.
            strip_whitespace: Whether to strip whitespace.
        """

    def split_text(self, text: str) -> List[str]:
        """
        Split text on the separator.
        """
```

#### Usage

```python
from langchain_text_splitters import CharacterTextSplitter

# Split by paragraph
splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(text)

# Split by sentence (Chinese full stop)
splitter = CharacterTextSplitter(
    separator="。",
    chunk_size=500,
    chunk_overlap=50,
    keep_separator=True
)
```
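
Note that `CharacterTextSplitter` never cuts inside a separator-delimited segment: a single segment longer than `chunk_size` is returned whole, and the splitter logs a warning. A small sketch illustrating this behavior:

```python
from langchain_text_splitters import CharacterTextSplitter

splitter = CharacterTextSplitter(separator="\n\n", chunk_size=20, chunk_overlap=0)

# The 100-character segment contains no "\n\n", so it cannot be reduced
# below chunk_size and comes back as one oversized chunk (with a warning).
chunks = splitter.split_text("short\n\n" + "x" * 100)
print([len(c) for c in chunks])  # [5, 100]
```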

### SemanticChunker

A semantic splitter (based on embedding similarity), provided by `langchain_experimental`.

```python
from langchain_experimental.text_splitter import SemanticChunker

class SemanticChunker:
    """Semantic splitter."""

    def __init__(
        self,
        embeddings: Embeddings,
        buffer_size: int = 1,
        breakpoint_threshold_type: str = "percentile",
        breakpoint_threshold_amount: Optional[float] = None,
    ):
        """
        Initialize.

        Args:
            embeddings: The embedding model.
            buffer_size: Number of neighboring sentences embedded together.
            breakpoint_threshold_type: Breakpoint detection strategy
                ("percentile", "standard_deviation", "interquartile", "gradient").
            breakpoint_threshold_amount: Threshold for the chosen strategy.
        """

    def split_text(self, text: str) -> List[str]:
        """
        Split based on semantic similarity: breakpoints are placed
        where the embedding distance between adjacent sentences spikes.
        """
```

#### Usage

```python
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

splitter = SemanticChunker(embeddings=OpenAIEmbeddings())

chunks = splitter.split_text(long_article)
```
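
Breakpoint detection is controlled by `breakpoint_threshold_type` and `breakpoint_threshold_amount` rather than a single 0-1 threshold; a sketch using the percentile strategy (the amount value here is illustrative):

```python
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",  # or "standard_deviation",
                                             # "interquartile", "gradient"
    breakpoint_threshold_amount=90.0,        # split at the 90th percentile
)
```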

## Code Splitters

### PythonCodeTextSplitter

```python
from langchain_text_splitters import PythonCodeTextSplitter

splitter = PythonCodeTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

# Splits along function and class definitions
chunks = splitter.split_text(python_code)
```

### Splitting JavaScript

There is no dedicated JavaScript splitter class; use `RecursiveCharacterTextSplitter.from_language` with the JS preset:

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS,
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(js_code)
```
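
To inspect the separator priority list behind a language preset, `RecursiveCharacterTextSplitter` exposes the static method `get_separators_for_language`:

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# Print the first few separators each preset splits on
for lang in (Language.PYTHON, Language.JS):
    seps = RecursiveCharacterTextSplitter.get_separators_for_language(lang)
    print(lang, seps[:4])
```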

## Special Formats

### MarkdownTextSplitter

A Markdown text splitter. It presets Markdown-aware separators (headings, code fences, rules) on top of `RecursiveCharacterTextSplitter`; header-level options belong to `MarkdownHeaderTextSplitter` below.

```python
from langchain_text_splitters import MarkdownTextSplitter

class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Markdown splitter."""

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
    ):
        """
        Initialize.

        Args:
            chunk_size: Chunk size.
            chunk_overlap: Overlap size.
        """
```

#### Usage

```python
from langchain_text_splitters import MarkdownTextSplitter

splitter = MarkdownTextSplitter(
    chunk_size=2000,
    chunk_overlap=200
)

chunks = splitter.split_text(markdown_content)

# Header-aware splitting with metadata
from langchain_text_splitters import MarkdownHeaderTextSplitter

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
)

docs = splitter.split_text(markdown_content)
# Each Document carries its enclosing headers as metadata
```
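
A common follow-up pattern is to split on headers first (attaching header metadata), then cap chunk sizes with a character splitter; the header metadata survives the second split:

```python
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

header_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)
size_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Header-aware Documents first, then size-bounded Documents
docs = size_splitter.split_documents(header_splitter.split_text(markdown_content))
```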

### LatexTextSplitter

A LaTeX text splitter.

```python
from langchain_text_splitters import LatexTextSplitter

splitter = LatexTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(latex_content)
```

### NLTKTextSplitter

Sentence splitting with NLTK (requires the `punkt` tokenizer data: `nltk.download("punkt")`).

```python
from langchain_text_splitters import NLTKTextSplitter

splitter = NLTKTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(text)
```

### SpacyTextSplitter

Sentence splitting with spaCy (loads the `en_core_web_sm` pipeline by default).

```python
from langchain_text_splitters import SpacyTextSplitter

splitter = SpacyTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separator=" "  # joins spaCy-detected sentences with a space
)

chunks = splitter.split_text(text)
```

## Usage Examples

```python
# ========== Example 1: Recursive splitting ==========
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

chunks = splitter.split_text(large_text)
print(f"Split into {len(chunks)} chunks")

# ========== Example 2: Splitting a list of documents ==========
from langchain_core.documents import Document

documents = [
    Document(page_content="Content of document 1..."),
    Document(page_content="Content of document 2...")
]

split_docs = splitter.split_documents(documents)

# ========== Example 3: Custom separators (e.g. for Chinese text) ==========
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n\n", "\n\n", "\n", "。", "!", "?", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    keep_separator=True
)

# ========== Example 4: Token-based length ==========
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")

def token_count(text: str) -> int:
    return len(encoding.encode(text))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # tokens
    chunk_overlap=200,
    length_function=token_count
)

# ========== Example 5: Attaching metadata ==========
from typing import Dict, List, Optional

from langchain_core.documents import Document

class MetadataSplitter(RecursiveCharacterTextSplitter):
    # Note: create_documents(texts, metadatas=...) already covers this;
    # the subclass just bundles it into one call. The method gets its own
    # name so it does not shadow split_text with a different return type.
    def split_text_with_metadata(
        self, text: str, metadata: Optional[Dict] = None
    ) -> List[Document]:
        chunks = self.split_text(text)
        return [
            Document(
                page_content=chunk,
                metadata=metadata or {}
            )
            for chunk in chunks
        ]

splitter = MetadataSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_text_with_metadata(text, metadata={"source": "document.txt"})

# ========== Example 6: Splitting HTML content ==========
from langchain_text_splitters import HTMLHeaderTextSplitter

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
    ]
)

docs = html_splitter.split_text(html_content)

# ========== Example 7: Recursively splitting HTML ==========
from langchain_text_splitters import RecursiveCharacterTextSplitter

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)

# Split by HTML structure first, then by character count
html_docs = html_splitter.split_text(html_content)
final_docs = text_splitter.split_documents(html_docs)

# ========== Example 8: Splitting a large file lazily ==========

def split_large_file(file_path: str, chunk_size: int = 1000):
    """Yield chunks of a file one at a time.

    The file itself is still read in one pass; only downstream
    processing of the chunks is lazy.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=200
    )

    with open(file_path, 'r') as f:
        content = f.read()

    for i, chunk in enumerate(splitter.split_text(content)):
        yield {
            "index": i,
            "content": chunk,
            "metadata": {"source": file_path, "chunk": i}
        }

# Usage
for chunk_data in split_large_file("large.txt"):
    process_chunk(chunk_data)
```

## Parameter Recommendations

| Scenario | chunk_size | chunk_overlap | Separators |
|----------|------------|---------------|------------|
| General text | 1000-2000 | 200 | defaults |
| Code | 500-1000 | 50-100 | language-specific |
| Long documents | 2000-4000 | 400 | paragraphs, sentences |
| Chinese text | 500-1000 | 100 | sentences, punctuation |
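
These are starting points, not hard rules; it can help to measure how a configuration behaves on your own corpus before committing to it. A small hypothetical helper for that:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

def report_chunk_stats(text: str, chunk_size: int, chunk_overlap: int) -> None:
    """Hypothetical helper: print chunk count and size stats for one config."""
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    sizes = [len(c) for c in splitter.split_text(text)]
    print(f"chunks={len(sizes)}  min={min(sizes)}  max={max(sizes)}  "
          f"avg={sum(sizes) / len(sizes):.0f}")
```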

## Related APIs