向量存储 API

VectorStore 与向量数据库

概述

向量存储用于存储和检索高维向量,支持语义相似度搜索,是 RAG 应用的核心组件。
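
向量检索的核心思路是: 先用嵌入模型把文本映射为高维向量,查询时计算查询向量与库中向量的相似度,取最相似的前 k 条作为结果。下面是一个脱离 LangChain 的极简示意(假设向量已由嵌入模型生成,仅用于说明原理):

import numpy as np

def cosine_similarity(a, b):
    """余弦相似度: 两个向量夹角的余弦值,越大越相似"""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

def top_k(query_vector, doc_vectors, k=4):
    """返回与查询向量最相似的 k 个文档下标"""
    scores = [cosine_similarity(query_vector, v) for v in doc_vectors]
    return sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:k]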

graph TD
    A[VectorStore] --> B[Chroma]
    A --> C[FAISS]
    A --> D[Pinecone]
    A --> E[Weaviate]
    A --> F[Milvus]
    A --> G[Qdrant]
    A --> H[Elasticsearch]

    A --> I[本地存储]
    I --> J[Chroma]
    I --> K[FAISS]

    A --> L[云端存储]
    L --> M[Pinecone]
    L --> N[Weaviate]

    style A fill:#e1f5fe
    style I fill:#c8e6c9

基类

VectorStore

向量存储抽象基类。

from langchain_core.vectorstores import VectorStore

class VectorStore(ABC):
    """向量存储基类"""

    @property
    @abstractmethod
    def embeddings(self) -> Embeddings:
        """关联的嵌入模型"""

    @abstractmethod
    def add_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """
        添加文本

        Args:
            texts: 文本列表
            metadatas: 元数据列表
            **kwargs: 额外参数

        Returns:
            添加的文档 ID 列表
        """

    @abstractmethod
    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """
        相似度搜索

        Args:
            query: 查询文本
            k: 返回结果数量
            **kwargs: 额外参数

        Returns:
            最相似的 Document 列表
        """

    @abstractmethod
    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """
        带分数的相似度搜索

        Returns:
            (Document, 相似度分数) 列表
        """

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """
        按向量搜索

        Args:
            embedding: 查询向量
            k: 返回结果数量

        Returns:
            最相似的 Document 列表
        """

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """
        最大边际相关性搜索 (MMR)

        Args:
            query: 查询文本
            k: 返回结果数量
            fetch_k: 获取候选数量
            lambda_mult: 相关性多样性平衡
                (0=多样性优先, 1=相关性优先)

        Returns:
            Document 列表
        """

    async def asimilarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """异步相似度搜索"""

    async def aadd_texts(
        self,
        texts: List[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """异步添加文本"""

    @classmethod
    @abstractmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "VectorStore":
        """
        从文本创建向量存储

        Args:
            texts: 文本列表
            embedding: 嵌入模型
            metadatas: 元数据列表
            **kwargs: 额外参数

        Returns:
            VectorStore 实例
        """

    @classmethod
    def from_documents(
        cls,
        documents: List[Document],
        embedding: Embeddings,
        **kwargs: Any,
    ) -> "VectorStore":
        """
        从文档创建向量存储

        Args:
            documents: Document 列表
            embedding: 嵌入模型
            **kwargs: 额外参数

        Returns:
            VectorStore 实例
        """

    def delete(
        self,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> Optional[bool]:
        """
        删除文档

        Args:
            ids: 要删除的文档 ID 列表
            **kwargs: 额外参数

        Returns:
            是否成功
        """

    def as_retriever(
        self,
        **kwargs: Any,
    ) -> VectorStoreRetriever:
        """
        转换为检索器

        Args:
            **kwargs: 检索器参数
                - search_type: "similarity" 或 "mmr"
                - search_kwargs: 搜索参数 (k, score_threshold 等)

        Returns:
            VectorStoreRetriever 实例
        """

实现类

Chroma

Chroma 向量数据库(本地)。

from langchain_chroma import Chroma

class Chroma(VectorStore):
    """Chroma 向量存储"""

    def __init__(
        self,
        collection_name: str = "langchain",
        embedding_function: Optional[Embeddings] = None,
        persist_directory: Optional[str] = None,
        client_settings: Optional[chromadb.config.Settings] = None,
        collection_metadata: Optional[dict] = None,
        client: Optional[chromadb.ClientAPI] = None,
    ):
        """
        初始化 Chroma

        Args:
            collection_name: 集合名称
            embedding_function: 嵌入模型
            persist_directory: 持久化目录
            client_settings: 客户端设置
            collection_metadata: 集合元数据
            client: 自定义客户端
        """

使用示例

from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

# 内存存储
vectorstore = Chroma(
    embedding_function=OpenAIEmbeddings()
)

# 持久化存储
vectorstore = Chroma(
    collection_name="my_collection",
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./chroma_db"
)

# 从文本创建
vectorstore = Chroma.from_texts(
    texts=["文档1", "文档2", "文档3"],
    embedding=OpenAIEmbeddings(),
    metadatas=[
        {"source": "txt1"},
        {"source": "txt2"},
        {"source": "txt3"}
    ],
    collection_name="my_collection"
)

# 相似度搜索
results = vectorstore.similarity_search("查询", k=3)

FAISS

FAISS 向量索引(本地)。

from langchain_community.vectorstores import FAISS

class FAISS(VectorStore):
    """FAISS 向量存储"""

    def __init__(
        self,
        embedding_function: Embeddings,
        index: Any,
        docstore: Docstore,
        index_to_docstore_id: Dict[int, str],
    ):
        """
        初始化 FAISS

        Args:
            embedding_function: 嵌入模型
            index: FAISS 索引
            docstore: 文档存储
            index_to_docstore_id: 索引到文档 ID 的映射
        """

使用示例

from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings

# 从文本创建
vectorstore = FAISS.from_texts(
    texts=["文档1", "文档2", "文档3"],
    embedding=OpenAIEmbeddings()
)

# 添加文档
vectorstore.add_texts(["新文档"])

# 相似度搜索
results = vectorstore.similarity_search("查询", k=3)

# 保存索引
vectorstore.save_local("faiss_index")

# 加载索引
vectorstore = FAISS.load_local(
    "faiss_index",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True
)

# MMR 搜索
results = vectorstore.max_marginal_relevance_search(
    "查询",
    k=3,
    lambda_mult=0.5
)
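
FAISS 索引还可以合并,便于分批构建后再汇总(merge_from 是 LangChain FAISS 封装提供的方法,以下为简单示意):

# 分批构建两个索引,再合并为一个
db1 = FAISS.from_texts(["文档A"], OpenAIEmbeddings())
db2 = FAISS.from_texts(["文档B"], OpenAIEmbeddings())

db1.merge_from(db2)  # db2 的向量与文档并入 db1
results = db1.similarity_search("查询", k=2)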

PineconeVectorStore

Pinecone 云端向量数据库。

from langchain_pinecone import PineconeVectorStore

class PineconeVectorStore(VectorStore):
    """Pinecone 向量存储"""

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        index_name: str,
        namespace: str = "",
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "PineconeVectorStore":
        """
        从文本创建

        Args:
            texts: 文本列表
            embedding: 嵌入模型
            index_name: Pinecone 索引名称
            namespace: 命名空间
            metadatas: 元数据列表
            **kwargs: 额外参数
        """

使用示例

from langchain_pinecone import PineconeVectorStore
from langchain_openai import OpenAIEmbeddings

# 初始化
import os
os.environ["PINECONE_API_KEY"] = "your-api-key"

vectorstore = PineconeVectorStore.from_texts(
    texts=["文档1", "文档2"],
    embedding=OpenAIEmbeddings(),
    index_name="my-index",
    namespace="my-namespace"
)

# 搜索
results = vectorstore.similarity_search("查询", k=3)
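
若索引已存在且数据已经写入,也可以直接连接已有索引进行检索(此处假设使用 from_existing_index,具体以 langchain_pinecone 当前版本的 API 为准):

# 连接已有索引,不重新写入数据
vectorstore = PineconeVectorStore.from_existing_index(
    index_name="my-index",
    embedding=OpenAIEmbeddings(),
    namespace="my-namespace"
)
results = vectorstore.similarity_search("查询", k=3)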

WeaviateVectorStore

Weaviate 向量数据库。

import weaviate
from langchain_openai import OpenAIEmbeddings
from langchain_weaviate.vectorstores import WeaviateVectorStore

# 需要先创建 weaviate 客户端连接(此处假设连接本地部署的 Weaviate)
client = weaviate.connect_to_local()

vectorstore = WeaviateVectorStore.from_texts(
    texts=["文档1", "文档2"],
    embedding=OpenAIEmbeddings(),
    client=client,
    index_name="MyIndex",
    text_key="text"
)

Milvus

Milvus 向量数据库。

from langchain_community.vectorstores import Milvus
from langchain_openai import OpenAIEmbeddings

vectorstore = Milvus.from_texts(
    texts=["文档1", "文档2"],
    embedding=OpenAIEmbeddings(),
    connection_args={"host": "localhost", "port": 19530},
    collection_name="my_collection"
)

Qdrant

Qdrant 向量数据库。

from langchain_openai import OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore

# 内存模式(location=":memory:" 使用进程内 Qdrant,无需单独部署服务)
vectorstore = QdrantVectorStore.from_texts(
    texts=["文档1", "文档2"],
    embedding=OpenAIEmbeddings(),
    location=":memory:",
    collection_name="my_collection"
)

ElasticsearchStore

Elasticsearch 向量搜索。

from langchain_community.vectorstores import ElasticsearchStore
from langchain_openai import OpenAIEmbeddings

vectorstore = ElasticsearchStore.from_texts(
    texts=["文档1", "文档2"],
    embedding=OpenAIEmbeddings(),
    index_name="my-index",
    es_url="http://localhost:9200"
)

使用示例

# ========== 示例1: 创建向量存储 ==========
from langchain_chroma import Chroma
from langchain_openai import OpenAIEmbeddings

vectorstore = Chroma(
    collection_name="my_docs",
    embedding_function=OpenAIEmbeddings(),
    persist_directory="./chroma_db"
)

# ========== 示例2: 添加文档 ==========
from langchain_core.documents import Document

documents = [
    Document(page_content="Python 是一种编程语言", metadata={"id": 1}),
    Document(page_content="JavaScript 也是一种编程语言", metadata={"id": 2})
]

vectorstore.add_documents(documents)
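
# ---------- 补充: 用 from_documents 一步建库 ----------
# 等价于先创建空库再 add_documents(以下为示意)
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=OpenAIEmbeddings(),
    collection_name="my_docs",
    persist_directory="./chroma_db"
)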

# ========== 示例3: 相似度搜索 ==========
results = vectorstore.similarity_search("编程语言", k=2)

for doc in results:
    print(f"内容: {doc.page_content}")
    print(f"元数据: {doc.metadata}")

# ========== 示例4: 带分数搜索 ==========
# 注意: 分数的含义取决于具体实现(例如 Chroma 默认返回距离,值越小越相似)
results = vectorstore.similarity_search_with_score("编程语言", k=2)

for doc, score in results:
    print(f"分数: {score:.4f}, 内容: {doc.page_content}")

# ========== 示例5: 按向量搜索 ==========
query_vector = OpenAIEmbeddings().embed_query("编程语言")
results = vectorstore.similarity_search_by_vector(query_vector, k=3)

# ========== 示例6: MMR 搜索 ==========
results = vectorstore.max_marginal_relevance_search(
    "编程语言",
    k=3,
    fetch_k=10,
    lambda_mult=0.5
)

# ========== 示例7: 过滤搜索 ==========
# 元数据过滤(filter 语法因实现而异,此处为 Chroma 的写法)
results = vectorstore.similarity_search(
    "编程语言",
    k=3,
    filter={"id": {"$eq": 1}}
)

# ========== 示例8: 转换为检索器 ==========
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

# 使用检索器
results = retriever.invoke("编程语言")

# ========== 示例9: MMR 检索器 ==========
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 3,
        "fetch_k": 10,
        "lambda_mult": 0.5
    }
)
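
# ---------- 补充: 按相似度阈值过滤的检索器 ----------
# search_type 还支持 "similarity_score_threshold",只返回分数不低于阈值的结果
retriever = vectorstore.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 3, "score_threshold": 0.5}
)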

# ========== 示例10: 删除文档 ==========
# ids 为添加文档时返回或指定的文档 ID
ids = ["doc_id_1", "doc_id_2"]
vectorstore.delete(ids=ids)

# ========== 示例11: FAISS 索引 ==========
from langchain_community.vectorstores import FAISS

# 创建索引
vectorstore = FAISS.from_texts(
    texts=["文档1", "文档2"],
    embedding=OpenAIEmbeddings()
)

# 保存
vectorstore.save_local("faiss_index")

# 加载
vectorstore = FAISS.load_local(
    "faiss_index",
    OpenAIEmbeddings(),
    allow_dangerous_deserialization=True
)

# ========== 示例12: 批量添加 ==========
texts = ["文档" + str(i) for i in range(1000)]
metadatas = [{"index": i} for i in range(1000)]

vectorstore.add_texts(texts, metadatas)
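
# ---------- 补充: 异步接口 ----------
# asimilarity_search / aadd_texts 为对应的异步方法,需在 async 函数中调用
import asyncio

async def search_async():
    return await vectorstore.asimilarity_search("编程语言", k=2)

results = asyncio.run(search_async())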

相关 API