一、介绍
1、背景
在构建现代 RAG(Retrieval-Augmented Generation)系统时,最核心的矛盾之一是:检索需要“精确命中”,但生成需要“足够上下文”。如果只做粗粒度切块(chunk),很容易出现命中不准;如果切得太细,又会导致上下文缺失。节点-句子滑动窗口检索正是为了解决这个问题而提出的一种工程化方案,它在 LlamaIndex 等框架中被广泛使用。
2、核心思想——从小到大
保证检索足够精准,同时不丢上下文语义。检索用“句子级别”,生成用“窗口级别”。
也就是说,在索引阶段,我们把文档拆分成最小语义单位——句子,每个句子作为一个独立节点参与向量化或关键词检索;但在返回结果时,并不只返回这一句话,而是返回它前后扩展的一段“窗口文本”。
这个窗口通常包含:
- 当前句子
- 前面 N 句
- 后面 N 句
句子滑动窗口检索想要的是:
- 召回精度接近句子级
- 上下文完整性接近段落级
3、它和普通“滑动窗口切片”有什么区别
| 方式 | 切分方式 | 存储什么 | 检索时怎么用 |
|---|---|---|---|
| 普通滑动窗口切片 | 按固定长度文本块 + overlap | 直接存重叠 chunk | 直接拿 chunk 给 LLM |
| 句子滑动窗口检索 | 按句子切 | 存“中心句 + 前后窗口” | 先召回句子,再用窗口替换 |
| 传统 window-based passage retrieval | 按固定窗口在全文上滑动 | 每个窗口都算一个 passage | 取得分最高的窗口 |
二、实现方法
LlamaIndex 官方把这套模式称为 SentenceWindowNodeParser + MetadataReplacementPostProcessor:
每个 node 是一个句子,metadata 里保存这个句子的上下文窗口;检索后再用窗口替换原句。
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.postprocessor import MetadataReplacementPostProcessor

# NOTE: `documents` is not defined in this snippet; it has to be loaded
# before this code runs.

# Post-processor that swaps each retrieved sentence for the text stored
# under its "window" metadata key.
window_replacer = MetadataReplacementPostProcessor(target_metadata_key="window")

# Step 1: sentence-level split; the surrounding window is written into
# each node's metadata.
sentence_parser = SentenceWindowNodeParser.from_defaults(
    window_size=2,  # two sentences on each side
    window_metadata_key="window",
    original_text_metadata_key="original_text",
)
sentence_nodes = sentence_parser.get_nodes_from_documents(documents)

# Step 2: build the index over the sentence nodes.
# Step 3: query it, replacing each hit with its window after retrieval.
engine = VectorStoreIndex(sentence_nodes).as_query_engine(
    similarity_top_k=3,
    node_postprocessors=[window_replacer],
)
answer = engine.query("电热水壶第一次使用前要做什么?")
print(answer)

# Article text fused onto the end of this line in the original, kept verbatim: 流程:
SentenceWindowNodeParser 的做法
每个 sentence 单独变成 node。为便于展示,下面的示例按前后各取 1 句(即 window_size=1)来构造窗口,如:
node1:
text:
Spring Boot 启动时会创建 ApplicationContext。
metadata:
{
"window": "
Spring Boot 启动时会创建 ApplicationContext。
ApplicationReadyEvent 会在应用启动完成后触发。
"
}
node2:
text:
ApplicationReadyEvent 会在应用启动完成后触发。
metadata:
{
"window": "
Spring Boot 启动时会创建 ApplicationContext。
ApplicationReadyEvent 会在应用启动完成后触发。
开发者可以监听该事件执行初始化逻辑。
"
}
node3:
text:
开发者可以监听该事件执行初始化逻辑。
metadata:
{
"window": "
ApplicationReadyEvent 会在应用启动完成后触发。
开发者可以监听该事件执行初始化逻辑。
例如启动 HTTP Server。
"
}
检索阶段
真正参与 embedding / 检索的是:单句 node.text
召回完成后
LlamaIndex 不会直接把:
ApplicationReadyEvent 会在应用启动完成后触发。
发给 LLM,而是先把 node 的文本替换成 metadata.window 中保存的窗口文本:
Spring Boot 启动时会创建 ApplicationContext。
ApplicationReadyEvent 会在应用启动完成后触发。
开发者可以监听该事件执行初始化逻辑。
这一步“用窗口替换原句”的后处理,就是由 MetadataReplacementPostProcessor 完成的。
三、demo
1、common
"""Shared settings and factory helpers for the sentence-window retrieval demo."""

import sys
from pathlib import Path

import httpx
import regex
from llama_index.core.node_parser import SentenceWindowNodeParser
from llama_index.core.schema import Document
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient

# Switch stdout to UTF-8 before anything is printed.
sys.stdout.reconfigure(encoding="utf-8")

# Directory two levels above this file; the knowledge-base docs live below it.
DEMO_DIR = Path(__file__).resolve().parents[1]
DOC_DIR = DEMO_DIR / "docs" / "small_appliance_kb"

COLLECTION_NAME = "window_db"
QDRANT_URL = "http://localhost:6333"
WINDOW_SIZE = 2

EMBEDDING_MODEL = "text-embedding-3-small"
LLM_MODEL = "gpt-5.1"
BASE_URL = "https://llm-xxx.xxxx/v1"
API_KEY = "l******"

# Metadata keys under which the parser stores the window and the sentence.
WINDOW_METADATA_KEY = "window"
ORIGINAL_TEXT_METADATA_KEY = "original_text"

# Sentence boundary: the position right after a sentence terminator, plus
# any whitespace that follows it. The class covers the CJK full stop and
# both the ASCII and the full-width (U+FF01, U+FF1F) exclamation/question
# marks; the full-width forms are written as escapes so they survive
# punctuation-normalising tools.
# NOTE(review): VERSION1 is kept from the original call; presumably it is
# what lets `regex` split on the zero-width match when no whitespace
# follows the terminator - confirm against the regex documentation.
_SENTENCE_BOUNDARY_RE = regex.compile(
    r"(?<=[。!?\uFF01\uFF1F])\s*",
    flags=regex.VERSION1,
)


class CompatQdrantClient(QdrantClient):
    """QdrantClient that also answers the older ``search()`` call."""

    # Keep LangChain compatibility with the older search() call path.
    def search(
        self,
        *,
        collection_name,
        query_vector,
        query_filter=None,
        search_params=None,
        limit=10,
        offset=0,
        with_payload=True,
        with_vectors=False,
        score_threshold=None,
        consistency=None,
        **kwargs,
    ):
        """Forward a ``search()`` call to ``query_points()``.

        ``query_vector`` is passed on as ``query``; every other argument is
        forwarded under its own name. Returns the ``points`` attribute of
        the ``query_points()`` response.
        """
        return self.query_points(
            collection_name=collection_name,
            query=query_vector,
            query_filter=query_filter,
            search_params=search_params,
            limit=limit,
            offset=offset,
            with_payload=with_payload,
            with_vectors=with_vectors,
            score_threshold=score_threshold,
            consistency=consistency,
            **kwargs,
        ).points


def make_client() -> CompatQdrantClient:
    """Return a client connected to ``QDRANT_URL``."""
    return CompatQdrantClient(url=QDRANT_URL)


def make_embeddings() -> OpenAIEmbeddings:
    """Return the embedding model used to vectorise sentences and queries."""
    # Step 1: build the embedding model used by Qdrant.
    # The real key travels in the X-Api-Key header; `api_key` is only a
    # placeholder value.
    return OpenAIEmbeddings(
        model=EMBEDDING_MODEL,
        api_key="dummy",
        base_url=BASE_URL,
        default_headers={"X-Api-Key": API_KEY},
        http_client=httpx.Client(trust_env=False),
    )


def make_vectorstore(client: QdrantClient) -> QdrantVectorStore:
    """Return a vector store over ``COLLECTION_NAME`` using ``client``."""
    return QdrantVectorStore(
        client=client,
        collection_name=COLLECTION_NAME,
        embedding=make_embeddings(),
    )


def make_llm() -> ChatOpenAI:
    """Return the chat model used to generate the final answer."""
    return ChatOpenAI(
        model=LLM_MODEL,
        base_url=BASE_URL,
        api_key="dummy",
        default_headers={"X-Api-Key": API_KEY},
        http_client=httpx.Client(trust_env=False),
    )


def list_doc_paths() -> list[Path]:
    """Return the sorted ``.md`` / ``.txt`` files directly inside ``DOC_DIR``.

    Returns an empty list when ``DOC_DIR`` does not exist.
    """
    if not DOC_DIR.exists():
        return []
    return sorted(
        path
        for path in DOC_DIR.iterdir()
        if path.is_file() and path.suffix.lower() in {".md", ".txt"}
    )


def split_sentences(text: str) -> list[str]:
    """Split ``text`` into sentences, line by line.

    Each non-blank line is cut after every sentence terminator (CJK full
    stop, ASCII or full-width exclamation/question mark). A line without
    any terminator is returned as a single sentence. Blank lines and
    surrounding whitespace are dropped.
    """
    sentences: list[str] = []
    normalized = text.replace("\r\n", "\n").strip()
    if not normalized:
        return []
    for line in normalized.split("\n"):
        line = line.strip()
        if not line:
            continue
        parts = [
            part.strip()
            for part in _SENTENCE_BOUNDARY_RE.split(line)
            if part.strip()
        ]
        sentences.extend(parts or [line])
    return sentences


def build_sentence_nodes(
    doc_paths: list[Path] | None = None,
    window_size: int = WINDOW_SIZE,
) -> list:
    """Parse the documents into sentence nodes that carry a window.

    Args:
        doc_paths: Files to read as UTF-8; defaults to ``list_doc_paths()``.
        window_size: Passed to ``SentenceWindowNodeParser.from_defaults``.

    Returns:
        The nodes produced by the parser, or an empty list when there are
        no documents. Each document records its file name under the
        ``source`` metadata key.
    """
    if doc_paths is None:
        doc_paths = list_doc_paths()
    if not doc_paths:
        return []
    documents = [
        Document(
            text=path.read_text(encoding="utf-8"),
            metadata={"source": path.name},
        )
        for path in doc_paths
    ]
    parser = SentenceWindowNodeParser.from_defaults(
        window_size=window_size,
        window_metadata_key=WINDOW_METADATA_KEY,
        original_text_metadata_key=ORIGINAL_TEXT_METADATA_KEY,
        sentence_splitter=split_sentences,
    )
    return parser.get_nodes_from_documents(documents)

# Article section heading fused onto the end of this line in the original, kept verbatim: 2、store
"""Rebuild the Qdrant collection from the sentence nodes of the docs folder."""

from qdrant_client.models import Distance, VectorParams

from common import (
    COLLECTION_NAME,
    DOC_DIR,
    QDRANT_URL,
    build_sentence_nodes,
    list_doc_paths,
    make_client,
    make_vectorstore,
)

# Size of the vectors stored in the collection.
# NOTE(review): presumably this has to equal the output dimension of the
# embedding model configured in common.py - confirm before changing either.
EMBEDDING_DIM = 1536

client = make_client()
try:
    doc_paths = list_doc_paths()
    if not doc_paths:
        raise SystemExit(f"no documents found in: {DOC_DIR}")

    nodes = build_sentence_nodes(doc_paths)
    if not nodes:
        raise SystemExit(f"no sentences generated from: {DOC_DIR}")

    # Rebuild the collection so the demo always stays in sync with the docs folder.
    if client.collection_exists(COLLECTION_NAME):
        client.delete_collection(COLLECTION_NAME)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=EMBEDDING_DIM, distance=Distance.COSINE),
    )

    # Store the single sentence as the text; the window and the other node
    # metadata travel alongside it as the metadata dict.
    vectorstore = make_vectorstore(client)
    vectorstore.add_texts(
        [node.text for node in nodes],
        metadatas=[dict(node.metadata or {}) for node in nodes],
    )

    print(f"stored sentences: {len(nodes)}")
    print(f"collection: {COLLECTION_NAME}")
    print(f"qdrant: {QDRANT_URL}")
    print("sources:")
    for path in doc_paths:
        print(f"- {path.name}")
finally:
    # Close the client on every path, including the SystemExit exits above.
    client.close()

# Article text fused onto the end of this line in the original, kept verbatim: 执行
3、ask
"""Answer one question using sentence-level retrieval and window replacement."""

from llama_index.core.postprocessor import MetadataReplacementPostProcessor
from llama_index.core.schema import NodeWithScore, TextNode
from langchain_core.prompts import ChatPromptTemplate

from common import COLLECTION_NAME, WINDOW_METADATA_KEY, make_client, make_llm, make_vectorstore

# Number of sentence hits requested from the vector store.
TOP_K = 3

client = make_client()
try:
    if not client.collection_exists(COLLECTION_NAME):
        raise SystemExit("run store.py first to build the collection.")

    vectorstore = make_vectorstore(client)
    question = "电热水壶第一次使用前要做什么?"

    # Step 1: retrieve the most relevant sentence nodes.
    hits = vectorstore.similarity_search_with_score(question, k=TOP_K)

    # Wrap every (document, score) hit as a LlamaIndex node so that the
    # LlamaIndex post-processor can be applied to it.
    nodes = [
        NodeWithScore(
            node=TextNode(text=doc.page_content, metadata=dict(doc.metadata or {})),
            score=score,
        )
        for doc, score in hits
    ]

    # Step 2: replace each sentence with its surrounding window.
    postprocessor = MetadataReplacementPostProcessor(target_metadata_key=WINDOW_METADATA_KEY)
    processed_nodes = postprocessor.postprocess_nodes(nodes)
    context = "\n\n".join(node.node.text for node in processed_nodes)

    # Step 3: ask the LLM, passing the joined windows as the context.
    prompt = ChatPromptTemplate.from_messages(
        [
            (
                "system",
                "你是小家电智能客服,只能根据上下文回答,尽量简洁直接,不要补充上下文之外的信息。",
            ),
            ("human", "上下文:\n{context}\n\n问题: {question}"),
        ]
    )
    llm = make_llm()
    resp = llm.invoke(prompt.format_messages(context=context, question=question))

    print("\nanswer:")
    print(resp.content)
finally:
    # Close the client on every path, including the SystemExit exit above.
    client.close()