Abstract class for the PreProcessor.
Module base
BasePreProcessor
class BasePreProcessor(BaseComponent)
BasePreProcessor.process
@abstractmethod
def process(documents: Union[dict, Document, List[Union[dict, Document]]],
clean_whitespace: Optional[bool] = True,
clean_header_footer: Optional[bool] = False,
clean_empty_lines: Optional[bool] = True,
remove_substrings: Optional[List[str]] = None,
split_by: Literal["token", "word", "sentence", "passage",
None] = "word",
split_length: Optional[int] = 1000,
split_overlap: Optional[int] = None,
split_respect_sentence_boundary: Optional[bool] = True,
tokenizer: Optional[Union[str,
PreTrainedTokenizerBase]] = "tiktoken",
id_hash_keys: Optional[List[str]] = None) -> List[Document]
Perform document cleaning and splitting. Accepts a single document or a list of documents, each given as either a Document object or a dict, and returns a list of Documents.