Documentation | API Reference | Tutorials | GitHub | Code Examples | Discord Community

Abstract class for the PreProcessor.

Module base

BasePreProcessor

class BasePreProcessor(BaseComponent)

BasePreProcessor.process

@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
            clean_whitespace: Optional[bool] = True,
            clean_header_footer: Optional[bool] = False,
            clean_empty_lines: Optional[bool] = True,
            remove_substrings: Optional[List[str]] = None,
            split_by: Literal["word", "sentence", "passage", None] = "word",
            split_length: Optional[int] = 1000,
            split_overlap: Optional[int] = None,
            split_respect_sentence_boundary: Optional[bool] = True,
            id_hash_keys: Optional[List[str]] = None) -> List[Document]

Perform document cleaning and splitting. Takes a single document (a Document or a dict) or a list of such documents as input and returns a list of Documents.