Module base

BasePreProcessor

class BasePreProcessor(BaseComponent)

BasePreProcessor.process

@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
            clean_whitespace: Optional[bool] = True,
            clean_header_footer: Optional[bool] = False,
            clean_empty_lines: Optional[bool] = True,
            remove_substrings: Optional[List[str]] = None,
            split_by: Literal["token", "word", "sentence", "passage",
                              None] = "word",
            split_length: Optional[int] = 1000,
            split_overlap: Optional[int] = None,
            split_respect_sentence_boundary: Optional[bool] = True,
            tokenizer: Optional[Union[str,
                                      PreTrainedTokenizerBase]] = "tiktoken",
            id_hash_keys: Optional[List[str]] = None) -> List[Document]

Perform document cleaning and splitting. Takes a single Document (or dict) or a list of Documents (or dicts) as input and returns a list of Documents.