CacheChecker

This component checks for the presence of Documents in a Document Store based on a specified cache field.

Name	CacheChecker
Folder Path	/caching/
Most common Position in a Pipeline	Flexible
Mandatory Input variables	“items”: a list of values associated with the cache_field in Documents
Output variables	“hits”: a list of Documents that were found with the specified value in the cache; “misses”: a list of values that could not be found

Overview

CacheChecker checks if a Document Store contains any Document with a value in the cache_field that matches any of the values provided in the items input variable. It returns a dictionary with two keys: "hits" and "misses". The values are lists of Documents that were found in the cache and items that were not, respectively.

Usage

On its own

from haystack.components.caching import CacheChecker

# For URL-based caching.
# NOTE: `my_doc_store` is not defined in this snippet; it is assumed to be an
# already-initialized Document Store instance.
cache_checker = CacheChecker(document_store=my_doc_store, cache_field="url")
cache_check_results = cache_checker.run(items=["https://example.com/resource", "https://another_example.com/other_resources"])
print(cache_check_results["hits"])    # List of Documents that were found in the cache: each has 'url': <one of the items above> in its metadata
print(cache_check_results["misses"])  # URLs that were not found in the cache, for example ["https://example.com/resource"]

# For caching based on a custom identifier: the same check, keyed on a
# different metadata field.
cache_checker = CacheChecker(document_store=my_doc_store, cache_field="metadata_field")
cache_check_results = cache_checker.run(items=["12345", "ABCDE"])
print(cache_check_results["hits"])    # Documents that were found in the cache: each has 'metadata_field': <one of the items above> in its metadata
print(cache_check_results["misses"])  # Values that were not found in the cache, for example ["ABCDE"]

In a Pipeline

from typing import Union

from pathlib import Path

from haystack import Pipeline
from haystack.dataclasses import ByteStream
from haystack.components.converters import TextFileToDocument
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.writers import DocumentWriter
from haystack.components.caching import CacheChecker
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Build an indexing pipeline that only converts files that are not cached yet.
pipeline = Pipeline()
# The same store instance is shared by the CacheChecker and the DocumentWriter,
# so Documents written by one run are visible to the cache check of the next run.
document_store = InMemoryDocumentStore()
# NOTE(review): presumably TextFileToDocument stores the source path under the
# "file_path" metadata key, which is what makes this cache_field match -- confirm.
pipeline.add_component(instance=CacheChecker(document_store, cache_field="file_path"), name="cache_checker")
pipeline.add_component(instance=TextFileToDocument(), name="text_file_converter")
pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
pipeline.add_component(instance=DocumentSplitter(split_by="sentence", split_length=250, split_overlap=30), name="splitter")
pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")
# Only the cache misses are forwarded to the converter; the "hits" output is
# not connected to any downstream component.
pipeline.connect("cache_checker.misses", "text_file_converter.sources")
pipeline.connect("text_file_converter.documents", "cleaner.documents")
pipeline.connect("cleaner.documents", "splitter.documents")
pipeline.connect("splitter.documents", "writer.documents")

# Render a diagram of the pipeline graph to an image file.
pipeline.draw("pipeline.png")

# Run the pipeline on a list of file paths; each path is checked against the cache first
result = pipeline.run({"cache_checker": {"items": ["code_of_conduct.txt"]}})
print(result)

# The second execution skips the files that were already processed
result = pipeline.run({"cache_checker": {"items": ["code_of_conduct.txt"]}})
print(result)

Updated about 9 hours ago