# TextSplitter API

## Overview

Text splitters break long text into smaller chunks so it can be embedded and retrieved effectively. LangChain provides several splitting strategies.
```mermaid
graph TD
    A[TextSplitter] --> B[CharacterTextSplitter]
    A --> C[RecursiveCharacterTextSplitter]
    A --> D[SemanticChunker]
    A --> E[Code splitters]
    E --> F[PythonCodeTextSplitter]
    E --> G["from_language(Language.JS)"]
    A --> H[Special formats]
    H --> I[MarkdownTextSplitter]
    H --> J[LatexTextSplitter]
    style A fill:#e1f5fe
    style C fill:#c8e6c9
```
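For most documents, `RecursiveCharacterTextSplitter` is a sensible default. A minimal sketch (chunk sizes are illustrative; `long_text` stands in for your document text):

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = splitter.split_text(long_text)  # returns a list of strings
```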
## Base Class

### TextSplitter

Abstract base class for all text splitters.
```python
from langchain_text_splitters import TextSplitter


class TextSplitter(ABC):
    """Base class for text splitters."""

    @abstractmethod
    def split_text(self, text: str) -> List[str]:
        """Split text.

        Args:
            text: The text to split.

        Returns:
            The list of text chunks.
        """

    def split_documents(self, documents: Iterable[Document]) -> List[Document]:
        """Split a list of documents.

        Args:
            documents: A list of Documents.

        Returns:
            The split Documents.
        """

    def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
        """Join chunks with the separator."""

    def create_documents(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
    ) -> List[Document]:
        """Create Documents from raw texts."""

    def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
        """Merge splits into chunks, respecting chunk_size and chunk_overlap."""
```
## Common Splitters

### RecursiveCharacterTextSplitter

Recursive character splitter (the recommended default).
```python
from langchain_text_splitters import RecursiveCharacterTextSplitter


class RecursiveCharacterTextSplitter(TextSplitter):
    """Recursive character splitter."""

    def __init__(
        self,
        separators: Optional[List[str]] = None,
        keep_separator: bool = True,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        strip_whitespace: bool = True,
    ):
        """Initialize.

        Args:
            separators: Separators to try, in priority order.
                Default: ["\n\n", "\n", " ", ""]
            keep_separator: Whether to keep separators in the output.
            chunk_size: Maximum chunk size (characters by default).
            chunk_overlap: Overlap between consecutive chunks.
            length_function: Function used to measure length.
            strip_whitespace: Whether to strip surrounding whitespace.
        """

    def split_text(self, text: str) -> List[str]:
        """Split text recursively.

        Splits on the highest-priority separator first; any chunk
        still larger than chunk_size is re-split with the next
        separator, and so on.
        """
```
#### Usage

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Basic usage
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200
)

chunks = splitter.split_text(large_text)

# Custom separators
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n\n", "\n\n", "\n", "。", "!", "?", " ", ""],
    chunk_size=500,
    chunk_overlap=50
)

# Splitting documents
from langchain_core.documents import Document

documents = splitter.split_documents([
    Document(page_content=large_text)
])

# Token-based length
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def token_count(text: str) -> int:
    return len(encoding.encode(text))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # 1000 tokens
    chunk_overlap=200,
    length_function=token_count
)
```
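If you would rather not wire up tiktoken by hand, the `from_tiktoken_encoder` class method (available on the splitter classes) builds the token-based length function for you:

```python
from langchain_text_splitters import RecursiveCharacterTextSplitter

# chunk_size / chunk_overlap are measured in tokens of the given encoding
splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=1000,
    chunk_overlap=200,
)
```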
### CharacterTextSplitter

Simple single-separator splitter.
```python
from langchain_text_splitters import CharacterTextSplitter


class CharacterTextSplitter(TextSplitter):
    """Character splitter."""

    def __init__(
        self,
        separator: str = "\n\n",
        keep_separator: bool = False,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
        length_function: Callable[[str], int] = len,
        strip_whitespace: bool = True,
    ):
        """Initialize.

        Args:
            separator: The single separator to split on.
            keep_separator: Whether to keep the separator.
            chunk_size: Chunk size.
            chunk_overlap: Overlap size.
            length_function: Length function.
            strip_whitespace: Whether to strip whitespace.
        """

    def split_text(self, text: str) -> List[str]:
        """Split text on the separator.

        Note: a single segment longer than chunk_size is kept whole
        (with a logged warning); only the merging step respects
        chunk_size.
        """
```
#### Usage

```python
from langchain_text_splitters import CharacterTextSplitter

# Split on paragraphs
splitter = CharacterTextSplitter(
    separator="\n\n",
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(text)

# Split on sentences (here, the Chinese full stop)
splitter = CharacterTextSplitter(
    separator="。",
    chunk_size=500,
    chunk_overlap=50,
    keep_separator=True
)
```
### SemanticChunker

Semantic splitter, based on embedding similarity. The experimental implementation lives in `langchain_experimental` and is named `SemanticChunker` (not `SemanticTextSplitter`).

```python
from langchain_experimental.text_splitter import SemanticChunker


class SemanticChunker:
    """Semantic splitter."""

    def __init__(
        self,
        embeddings: Embeddings,
        buffer_size: int = 1,
        breakpoint_threshold_type: str = "percentile",
        breakpoint_threshold_amount: Optional[float] = None,
    ):
        """Initialize.

        Args:
            embeddings: Embedding model.
            buffer_size: Number of neighboring sentences grouped per embedding.
            breakpoint_threshold_type: How breakpoints are chosen
                ("percentile", "standard_deviation", "interquartile").
            breakpoint_threshold_amount: Threshold for the chosen type
                (e.g. 95.0 for "percentile").
        """

    def split_text(self, text: str) -> List[str]:
        """Split on semantic similarity.

        Embeds sentence groups and breaks where adjacent embeddings
        diverge the most.
        """
```
#### Usage

```python
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai import OpenAIEmbeddings

splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(),
    breakpoint_threshold_type="percentile",
    breakpoint_threshold_amount=95.0
)

chunks = splitter.split_text(long_article)
```
## Code Splitters

### PythonCodeTextSplitter

```python
from langchain_text_splitters import PythonCodeTextSplitter

splitter = PythonCodeTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

# Splits along class and function definitions first
chunks = splitter.split_text(python_code)
```
### JavaScript

There is no dedicated JavaScript splitter class; use `RecursiveCharacterTextSplitter.from_language`:

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.JS,
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(js_code)
```
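Both of the above are thin wrappers over `RecursiveCharacterTextSplitter.from_language`, which covers many more languages. A sketch of inspecting and reusing the language presets (assuming a recent langchain-text-splitters release):

```python
from langchain_text_splitters import Language, RecursiveCharacterTextSplitter

# See which separators a language preset splits on (classes, functions, blocks)
print(RecursiveCharacterTextSplitter.get_separators_for_language(Language.PYTHON))

# The same factory works for Language.JAVA, Language.GO, Language.MARKDOWN, ...
go_splitter = RecursiveCharacterTextSplitter.from_language(
    language=Language.GO,
    chunk_size=1000,
    chunk_overlap=100,
)
```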
### MarkdownTextSplitter

Markdown text splitter. `MarkdownTextSplitter` is a `RecursiveCharacterTextSplitter` preconfigured with Markdown separators; header-aware splitting with metadata is handled by `MarkdownHeaderTextSplitter` (shown below).

```python
from langchain_text_splitters import MarkdownTextSplitter


class MarkdownTextSplitter(RecursiveCharacterTextSplitter):
    """Markdown splitter."""

    def __init__(
        self,
        chunk_size: int = 4000,
        chunk_overlap: int = 200,
    ):
        """Initialize.

        Args:
            chunk_size: Chunk size.
            chunk_overlap: Overlap between chunks.
        """
```
#### Usage

```python
from langchain_text_splitters import MarkdownTextSplitter

splitter = MarkdownTextSplitter(
    chunk_size=2000,
    chunk_overlap=200
)

chunks = splitter.split_text(markdown_content)

# Header-aware splitting with metadata
from langchain_text_splitters import MarkdownHeaderTextSplitter

splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[
        ("#", "Header 1"),
        ("##", "Header 2"),
    ]
)

docs = splitter.split_text(markdown_content)
# Each Document carries its enclosing headers as metadata
```
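A common pattern is to combine the two: split on headers first to get metadata, then bound chunk sizes with a character splitter. Header metadata survives the second pass:

```python
from langchain_text_splitters import (
    MarkdownHeaderTextSplitter,
    RecursiveCharacterTextSplitter,
)

header_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=[("#", "Header 1"), ("##", "Header 2")]
)
char_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

sections = header_splitter.split_text(markdown_content)  # List[Document]
chunks = char_splitter.split_documents(sections)  # header metadata preserved
```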
### LatexTextSplitter

LaTeX text splitter.

```python
from langchain_text_splitters import LatexTextSplitter

splitter = LatexTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(latex_content)
```
### NLTKTextSplitter

Sentence splitting with NLTK (the punkt tokenizer must be downloaded first: `nltk.download("punkt")`).

```python
from langchain_text_splitters import NLTKTextSplitter

splitter = NLTKTextSplitter(
    chunk_size=1000,
    chunk_overlap=100
)

chunks = splitter.split_text(text)
```
### SpacyTextSplitter

Sentence splitting with spaCy (requires an installed spaCy pipeline such as `en_core_web_sm`).

```python
from langchain_text_splitters import SpacyTextSplitter

splitter = SpacyTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separator=" "  # joins sentences within a chunk; boundaries come from spaCy
)

chunks = splitter.split_text(text)
```
## Usage Examples

```python
# ========== Example 1: Recursive splitting ==========
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len,
)

chunks = splitter.split_text(large_text)
print(f"Split into {len(chunks)} chunks")

# ========== Example 2: Splitting a list of documents ==========
from langchain_core.documents import Document

documents = [
    Document(page_content="Document 1 content..."),
    Document(page_content="Document 2 content...")
]

split_docs = splitter.split_documents(documents)
# ========== Example 3: Custom separators ==========
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n\n", "\n\n", "\n", "。", "!", "?", " ", ""],
    chunk_size=500,
    chunk_overlap=50,
    keep_separator=True
)

# ========== Example 4: Token-based length ==========
import tiktoken

encoding = tiktoken.encoding_for_model("gpt-4")

def token_count(text: str) -> int:
    return len(encoding.encode(text))

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # tokens
    chunk_overlap=200,
    length_function=token_count
)
# ========== Example 5: Attaching metadata ==========
from langchain_core.documents import Document
from typing import Dict, List

class MetadataSplitter(RecursiveCharacterTextSplitter):
    # Note: this override changes the base split_text contract
    # (it returns Documents rather than strings).
    def split_text(self, text: str, metadata: Dict = None) -> List[Document]:
        chunks = super().split_text(text)
        return [
            Document(
                page_content=chunk,
                metadata=metadata or {}
            )
            for chunk in chunks
        ]

splitter = MetadataSplitter(chunk_size=1000, chunk_overlap=200)
docs = splitter.split_text(text, metadata={"source": "document.txt"})
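
# Built-in alternative: create_documents attaches metadata without subclassing
base_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = base_splitter.create_documents([text], metadatas=[{"source": "document.txt"}])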
# ========== Example 6: Splitting HTML content ==========
from langchain_text_splitters import HTMLHeaderTextSplitter

html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[
        ("h1", "Header 1"),
        ("h2", "Header 2"),
        ("h3", "Header 3"),
    ]
)

docs = html_splitter.split_text(html_content)

# ========== Example 7: Recursive splitting of HTML ==========
# headers_to_split_on is required by HTMLHeaderTextSplitter
html_splitter = HTMLHeaderTextSplitter(
    headers_to_split_on=[("h1", "Header 1"), ("h2", "Header 2")]
)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000)

# First split along the HTML structure, then by characters
html_docs = html_splitter.split_text(html_content)
final_docs = text_splitter.split_documents(html_docs)
# ========== Example 8: Chunking a large file lazily ==========
def split_large_file(file_path: str, chunk_size: int = 1000):
    """Yield a file's chunks one at a time.

    Note: the file itself is read into memory in full;
    only chunk consumption is lazy.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=200
    )
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    for i, chunk in enumerate(splitter.split_text(content)):
        yield {
            "index": i,
            "content": chunk,
            "metadata": {"source": file_path, "chunk": i}
        }

# Usage (process_chunk stands in for your own handler)
for chunk_data in split_large_file("large.txt"):
    process_chunk(chunk_data)
```