Sweep through Document Stores and return a set of candidate documents that are relevant to the query.
Module haystack_experimental.components.retrievers.chat_message_retriever
ChatMessageRetriever
Retrieves chat messages from the underlying ChatMessageStore.
Usage example:
from haystack.dataclasses import ChatMessage
from haystack_experimental.components.retrievers import ChatMessageRetriever
from haystack_experimental.chat_message_stores.in_memory import InMemoryChatMessageStore
messages = [
ChatMessage.from_assistant("Hello, how can I help you?"),
ChatMessage.from_user("Hi, I have a question about Python. What is a Protocol?"),
]
message_store = InMemoryChatMessageStore()
message_store.write_messages(messages)
retriever = ChatMessageRetriever(message_store)
result = retriever.run()
print(result["messages"])
ChatMessageRetriever.__init__
def __init__(message_store: ChatMessageStore, last_k: int = 10)
Create the ChatMessageRetriever component.
Arguments:
message_store
: An instance of a ChatMessageStore.
last_k
: The number of last messages to retrieve. Defaults to 10 messages if not specified.
ChatMessageRetriever.to_dict
def to_dict() -> Dict[str, Any]
Serializes the component to a dictionary.
Returns:
Dictionary with serialized data.
ChatMessageRetriever.from_dict
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> "ChatMessageRetriever"
Deserializes the component from a dictionary.
Arguments:
data
: The dictionary to deserialize from.
Returns:
The deserialized component.
ChatMessageRetriever.run
@component.output_types(messages=List[ChatMessage])
def run(last_k: Optional[int] = None) -> Dict[str, List[ChatMessage]]
Run the ChatMessageRetriever.
Arguments:
last_k
: The number of last messages to retrieve. This parameter takes precedence over the last_k parameter passed to the ChatMessageRetriever constructor. If unspecified, the last_k parameter passed to the constructor will be used.
Raises:
ValueError
: If last_k is not None and is less than 1.
Returns:
messages
- The retrieved chat messages.
Module haystack_experimental.components.retrievers.multi_query_embedding_retriever
MultiQueryEmbeddingRetriever
A component that retrieves documents using multiple queries in parallel with an embedding-based retriever.
This component takes a list of text queries, converts them to embeddings using a query embedder, and then uses an embedding-based retriever to find relevant documents for each query in parallel. The results are combined and sorted by relevance score.
Usage example
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.retrievers import InMemoryEmbeddingRetriever
from haystack.components.writers import DocumentWriter
from haystack_experimental.components.retrievers import MultiQueryEmbeddingRetriever
documents = [
Document(content="Renewable energy is energy that is collected from renewable resources."),
Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
Document(content="Geothermal energy is heat that comes from the sub-surface of the earth."),
Document(content="Biomass energy is produced from organic materials, such as plant and animal waste."),
Document(content="Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources."),
]
# Populate the document store
doc_store = InMemoryDocumentStore()
doc_embedder = SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
doc_embedder.warm_up()
doc_writer = DocumentWriter(document_store=doc_store, policy=DuplicatePolicy.SKIP)
documents = doc_embedder.run(documents)["documents"]
doc_writer.run(documents=documents)
# Run the multi-query retriever
in_memory_retriever = InMemoryEmbeddingRetriever(document_store=doc_store, top_k=1)
query_embedder = SentenceTransformersTextEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")
multi_query_retriever = MultiQueryEmbeddingRetriever(
retriever=in_memory_retriever,
query_embedder=query_embedder,
max_workers=3
)
queries = ["Geothermal energy", "natural gas", "turbines"]
result = multi_query_retriever.run(queries=queries)
for doc in result["documents"]:
print(f"Content: {doc.content}, Score: {doc.score}")
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 0.8509603046266574
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 0.42763211298893034
>> Content: Solar energy is a type of green energy that is harnessed from the sun., Score: 0.40077417016494354
>> Content: Fossil fuels, such as coal, oil, and natural gas, are non-renewable energy sources., Score: 0.3774863680995796
>> Content: Wind energy is another type of green energy that is generated by wind turbines., Score: 0.3091423972562246
>> Content: Biomass energy is produced from organic materials, such as plant and animal waste., Score: 0.25173074243668087
MultiQueryEmbeddingRetriever.__init__
def __init__(*,
retriever: EmbeddingRetriever,
query_embedder: TextEmbedder,
max_workers: int = 3)
Initialize MultiQueryEmbeddingRetriever.
Arguments:
retriever
: The embedding-based retriever to use for document retrieval.
query_embedder
: The query embedder to convert text queries to embeddings.
max_workers
: Maximum number of worker threads for parallel processing.
MultiQueryEmbeddingRetriever.warm_up
def warm_up() -> None
Warm up the query embedder and the retriever, if either has a warm_up method.
MultiQueryEmbeddingRetriever.run
@component.output_types(documents=List[Document])
def run(queries: List[str],
retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, Any]
Retrieve documents using multiple queries in parallel.
Arguments:
queries
: List of text queries to process.
retriever_kwargs
: Optional dictionary of arguments to pass to the retriever's run method.
Returns:
A dictionary containing:
documents
: List of retrieved documents sorted by relevance score.
MultiQueryEmbeddingRetriever.to_dict
def to_dict() -> dict[str, Any]
Serializes the component to a dictionary.
Returns:
A dictionary representing the serialized component.
MultiQueryEmbeddingRetriever.from_dict
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "MultiQueryEmbeddingRetriever"
Deserializes the component from a dictionary.
Arguments:
data
: The dictionary to deserialize from.
Returns:
The deserialized component.
Module haystack_experimental.components.retrievers.multi_query_text_retriever
MultiQueryTextRetriever
A component that retrieves documents using multiple queries in parallel with a text-based retriever.
This component takes a list of text queries and uses a text-based retriever to find relevant documents for each query in parallel, using a thread pool to manage concurrent execution. The results are combined and sorted by relevance score.
You can use this component in combination with the QueryExpander component to enhance the retrieval process.
Usage example
from haystack import Document
from haystack.components.writers import DocumentWriter
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.document_stores.types import DuplicatePolicy
from haystack.components.retrievers import InMemoryBM25Retriever
from haystack_experimental.components.query import QueryExpander
from haystack_experimental.components.retrievers.multi_query_text_retriever import MultiQueryTextRetriever
documents = [
Document(content="Renewable energy is energy that is collected from renewable resources."),
Document(content="Solar energy is a type of green energy that is harnessed from the sun."),
Document(content="Wind energy is another type of green energy that is generated by wind turbines."),
Document(content="Hydropower is a form of renewable energy using the flow of water to generate electricity."),
Document(content="Geothermal energy is heat that comes from the sub-surface of the earth.")
]
document_store = InMemoryDocumentStore()
doc_writer = DocumentWriter(document_store=document_store, policy=DuplicatePolicy.SKIP)
doc_writer.run(documents=documents)
in_memory_retriever = InMemoryBM25Retriever(document_store=document_store, top_k=1)
multiquery_retriever = MultiQueryTextRetriever(retriever=in_memory_retriever)
results = multiquery_retriever.run(queries=["renewable energy?", "Geothermal", "Hydropower"])
for doc in results["documents"]:
print(f"Content: {doc.content}, Score: {doc.score}")
>>
>> Content: Geothermal energy is heat that comes from the sub-surface of the earth., Score: 1.6474448833731097
>> Content: Hydropower is a form of renewable energy using the flow of water to generate electricity., Score: 1.6157822790079805
>> Content: Renewable energy is energy that is collected from renewable resources., Score: 1.5255309812344944
MultiQueryTextRetriever.__init__
def __init__(retriever: TextRetriever, max_workers: int = 3)
Initialize MultiQueryTextRetriever.
Arguments:
retriever
: The text-based retriever to use for document retrieval.
max_workers
: Maximum number of worker threads for parallel processing. Default is 3.
MultiQueryTextRetriever.warm_up
def warm_up() -> None
Warm up the retriever if it has a warm_up method.
MultiQueryTextRetriever.run
@component.output_types(documents=list[Document])
def run(queries: List[str],
retriever_kwargs: Optional[dict[str, Any]] = None) -> dict[str, Any]
Retrieve documents using multiple queries in parallel.
Arguments:
queries
: List of text queries to process.
retriever_kwargs
: Optional dictionary of arguments to pass to the retriever's run method.
Returns:
A dictionary containing:
documents
: List of retrieved documents sorted by relevance score.
MultiQueryTextRetriever.to_dict
def to_dict() -> dict[str, Any]
Serializes the component to a dictionary.
Returns:
The serialized component as a dictionary.
MultiQueryTextRetriever.from_dict
@classmethod
def from_dict(cls, data: dict[str, Any]) -> "MultiQueryTextRetriever"
Deserializes the component from a dictionary.
Arguments:
data
: The dictionary to deserialize from.
Returns:
The deserialized component.