LLM based Curation

LLMCurate

Bases: BaseCurate

Parameters:

Name	Type	Description	Default
`model`	`AutoModelForCausalLM`	Instantiated LLM	required
`tokenizer`	`AutoTokenizer`	Instantiated tokenizer corresponding to the `model`	required
`verbose`	`bool`	Sets the verbosity level during execution. `True` indicates logging level INFO and `False` indicates logging level 'WARNING'. Defaults to False.	`False`

Examples:

```python

llmc = LLMCurate(model, tokenizer)
ds = llmc.run(
    column_to_curate=column_to_curate,
    data=data,
    ds_column_mapping=ds_column_mapping,
    prompt_variants=prompt_variants,
    llm_response_cleaned_column_list=llm_response_cleaned_column_list,
    answer_start_token=answer_start_token,
    answer_end_token=answer_end_token,
    batch_size=batch_size,
    max_new_tokens=max_new_tokens,
)
```

where

* `model` and `tokenizer` are the instantiated LLM model and tokenizer objects respectively
* `data` is a pandas dataframe containing samples with our target text for curation under column `column_to_curate`
* `ds_column_mapping` is the dictionary mapping of entities used in the LLM prompt and the corresponding columns in `data`. For example, `ds_column_mapping={'INPUT' : 'input_column'}` would imply that text under `input_column` in `data` would be passed to the LLM in the format `"[INPUT]row['input_column'][/INPUT]"` for each `row` in `data`
* `prompt_variants` is the list of LLM prompts to be used to curate `column_to_curate` and `llm_response_cleaned_column_list` is the corresponding list of column names to store the reference responses generated using each prompt
* `answer_start_token` and `answer_end_token` are optional text phrases representing the start and end of the answer respectively.

`ds` is a dataset object with the following additional features:

1. A feature for each column name in `llm_response_cleaned_column_list`
2. An LLM confidence score for each text in `column_to_curate`

Source code in dqc/llm.py

class LLMCurate(BaseCurate):
    """Curate a text column by comparing it against LLM-generated reference responses.

    Args:
        model (AutoModelForCausalLM): Instantiated LLM
        tokenizer (AutoTokenizer): Instantiated tokenizer corresponding to the `model`
        verbose (bool, optional): Sets the verbosity level during execution. `True` indicates logging level INFO and `False` indicates logging level WARNING. Defaults to False.
        **options: Additional keyword arguments forwarded to `BaseCurate.__init__`.

    Examples:
     ```python

    llmc = LLMCurate(model, tokenizer)
    ds = llmc.run(
            column_to_curate=column_to_curate,
            data=data,
            ds_column_mapping=ds_column_mapping,
            prompt_variants=prompt_variants,
            llm_response_cleaned_column_list=llm_response_cleaned_column_list,
            answer_start_token=answer_start_token,
            answer_end_token=answer_end_token,
            batch_size=batch_size,
            max_new_tokens=max_new_tokens
            )
    ```
    where
    * `model` and `tokenizer` are the instantiated LLM model and tokenizer objects respectively
    * `data` is a pandas dataframe containing samples with our target text for curation under column `column_to_curate`
    * `ds_column_mapping` is the dictionary mapping of entities used in the LLM prompt and the corresponding columns in `data`. For example, `ds_column_mapping={'INPUT' : 'input_column'}` would imply that text under `input_column` in `data` would be passed to the LLM in the format `"[INPUT]row['input_column'][/INPUT]"` for each `row` in `data`
    * `prompt_variants` is the list of LLM prompts to be used to curate `column_to_curate` and `llm_response_cleaned_column_list` is the corresponding list of column names to store the reference responses generated using each prompt
    * `answer_start_token` and `answer_end_token` are optional text phrases representing the start and end of the answer respectively.
    * `batch_size` and `max_new_tokens` are not named parameters of `run`; they are collected in `**options` and forwarded to `run_LLM`, so they must be passed by keyword.

    `ds` is a dataset object with the following additional features -
    1. Feature for each column name in `llm_response_cleaned_column_list`
    2. LLM Confidence score for each text in `column_to_curate`

    """

    def __init__(
        self,
        model: AutoModelForCausalLM,
        tokenizer: AutoTokenizer,
        verbose: bool = False,
        **options,
    ):
        super().__init__(**options)

        _validate_init_params(model, tokenizer, verbose)
        self.model = model
        self.tokenizer = tokenizer
        self.verbose = verbose
        self._set_verbosity(verbose)

        # Holds the dataset produced by the most recent `run`, so that scoring
        # can be repeated later with `skip_llm_inference=True`.
        self.ds_ensemble = None

    def __str__(self):
        """Return the instance attributes as a string, omitting `ds_ensemble`."""
        hidden_attributes = {"ds_ensemble"}
        return str(
            {
                key: value
                for key, value in self.__dict__.items()
                if key not in hidden_attributes
            }
        )

    __repr__ = __str__

    def _set_verbosity(self, verbose: bool):
        """Set logger level based on user input for parameter `verbose`

        Args:
            verbose (bool): Indicator for verbosity
        """
        logger.set_level("INFO" if verbose else "WARNING")

    def fit_transform(self):
        """No-op placeholder; use `run` instead.

        NOTE(review): presumably present to satisfy the `BaseCurate` interface - confirm.
        """
        pass

    def run(
        self,
        column_to_curate: str,
        data: Union[pd.DataFrame, Dataset] = None,
        ds_column_mapping: dict = {},
        prompt_variants: List[str] = [""],
        skip_llm_inference: bool = False,
        llm_response_cleaned_column_list: List[str] = ["reference_prediction"],
        return_scores: bool = True,
        answer_start_token: str = "",
        answer_end_token: str = "",
        scoring_params: dict = {
            "scoring_method": "exact_match",
            "case_sensitive": False,
        },
        **options,
    ) -> Dataset:
        """Run LLMCurate on the input data

        Args:
            column_to_curate (str): Column name in `data` with the text that needs to be curated
            data (Union[pd.DataFrame, Dataset]): Input data for LLM based curation
            ds_column_mapping (dict, optional): Mapping of entities to be used in the LLM prompt and the corresponding columns in the input data. Defaults to {}.
            prompt_variants (List[str], optional): List of different LLM prompts to be used to curate the labels under `column_to_curate`. Defaults to [''].
            skip_llm_inference (bool, optional): Indicator variable to prevent re-running LLM inference. Set to `True` if artifacts from the previous run of LLMCurate need to be reused. Else `False`. Defaults to False.
            llm_response_cleaned_column_list (list, optional): Names of the columns that will contain LLM predictions for each input prompt in `prompt_variants`. Defaults to ['reference_prediction'].
            return_scores (bool, optional): Indicator variable set to `True` if label confidence scores are to be computed for each label under `column_to_curate`. Defaults to True.
            answer_start_token (str, optional): Token that indicates the start of answer generation. Defaults to ''
            answer_end_token (str, optional): Token that indicates the end of answer generation. Defaults to ''
            scoring_params (dict, optional): Parameters related to util function `compute_selfensembling_confidence_score` to compute confidence scores of `column_to_curate`
            **options: Additional keyword arguments forwarded unchanged to `run_LLM`.

        Returns:
            Dataset: Input dataset with reference responses. If `return_scores=True`, then input dataset with reference responses and confidence scores.
        """
        # The mutable default arguments are only read here, never mutated in
        # this method, so they are kept to leave the public signature unchanged.
        if not skip_llm_inference:
            empty_string_col_list = _validate_run_params(
                data,
                column_to_curate,
                ds_column_mapping,
                prompt_variants,
                llm_response_cleaned_column_list,
            )

            if len(empty_string_col_list) > 0:
                # f-string so that the offending column names actually appear
                # in the log message.
                logger.warning(
                    f"Found empty string(s) in the input data under column(s) {empty_string_col_list}"
                )

            logger.info(
                f"Running the LLM to generate the {len(prompt_variants)} reference responses using `prompt_variants`.."
            )
            ds_ensemble = None

            for index, prompt_template_prefix in enumerate(prompt_variants):
                # Each prompt variant writes its cleaned response to the column
                # name at the same position in `llm_response_cleaned_column_list`.
                proposed_answer_col_name = llm_response_cleaned_column_list[index]
                ds = run_LLM(
                    data,
                    self.model,
                    self.tokenizer,
                    ds_column_mapping=ds_column_mapping,
                    prompt_template_prefix=prompt_template_prefix,
                    answer_start_token=answer_start_token,
                    answer_end_token=answer_end_token,
                    llm_response_cleaned_col_name=proposed_answer_col_name,
                    # `random_state` is not assigned in this class; presumably
                    # set by `BaseCurate.__init__` - confirm.
                    random_state=self.random_state,
                    **options,
                )

                # Compare against None explicitly: a truthiness test would treat
                # a zero-row dataset as "not yet initialised" and drop the
                # columns collected so far.
                if ds_ensemble is None:
                    ds_ensemble = ds
                else:
                    ds_ensemble = ds_ensemble.add_column(
                        proposed_answer_col_name, ds[proposed_answer_col_name]
                    )
            self.ds_ensemble = ds_ensemble

        if return_scores:
            # NOTE(review): with the default `prompt_variants=['']` this
            # condition is always truthy, so the warning is emitted on every
            # run that skips inference.
            if skip_llm_inference and (
                isinstance(data, pd.DataFrame)
                or ds_column_mapping
                or prompt_variants
            ):
                logger.warning(
                    "Ignoring params `data`, `ds_column_mapping` and `prompt_variants` since `skip_llm_inference` is set to `True`"
                )

            # Treat "no previous run" (None) the same as an empty dataset so the
            # handler is reached instead of `len(None)` raising a TypeError.
            ds_ensemble_is_empty = (
                self.ds_ensemble is None or len(self.ds_ensemble) == 0
            )
            _empty_ds_ensemble_handler(ds_ensemble_is_empty, skip_llm_inference)

            logger.info(
                "Computing confidence scores using the LLM reference responses.."
            )
            self.ds_ensemble = self.ds_ensemble.map(
                compute_selfensembling_confidence_score,
                fn_kwargs={
                    "target_column": column_to_curate,
                    "reference_column_list": llm_response_cleaned_column_list,
                    **scoring_params,
                },
            )

        return self.ds_ensemble

`run(column_to_curate, data=None, ds_column_mapping={}, prompt_variants=[''], skip_llm_inference=False, llm_response_cleaned_column_list=['reference_prediction'], return_scores=True, answer_start_token='', answer_end_token='', scoring_params={'scoring_method': 'exact_match', 'case_sensitive': False}, **options)`

Run LLMCurate on the input data

Parameters:

Name	Type	Description	Default
`column_to_curate`	`str`	Column name in `data` with the text that needs to be curated	required
`data`	`Union[DataFrame, Dataset]`	Input data for LLM based curation	`None`
`ds_column_mapping`	`dict`	Mapping of entities to be used in the LLM prompt and the corresponding columns in the input data. Defaults to {}.	`{}`
`prompt_variants`	`List[str]`	List of different LLM prompts to be used to curate the labels under `column_to_curate`. Defaults to [''].	`['']`
`skip_llm_inference`	`bool`	Indicator variable to prevent re-running LLM inference. Set to `True` if artifacts from the previous run of LLMCurate need to be reused, else `False`. Defaults to False.	`False`
`llm_response_cleaned_column_list`	`list`	Names of the columns that will contain LLM predictions for each input prompt in `prompt_variants`. Defaults to ['reference_prediction'].	`['reference_prediction']`
`return_scores`	`bool`	Indicator variable set to `True` if label confidence scores are to be computed for each label under `column_to_curate`. Defaults to True.	`True`
`answer_start_token`	`str`	Token that indicates the start of answer generation. Defaults to ''	`''`
`answer_end_token`	`str`	Token that indicates the end of answer generation. Defaults to ''	`''`
`scoring_params`	`dict`	Parameters related to util function `compute_selfensembling_confidence_score` to compute confidence scores of `column_to_curate`	`{'scoring_method': 'exact_match', 'case_sensitive': False}`

Returns:

Name	Type	Description
`Dataset`	`Dataset`	Input dataset with reference responses. If `return_scores=True`, then input dataset with reference responses and confidence scores.

Source code in dqc/llm.py

def run(
    self,
    column_to_curate: str,
    data: Union[pd.DataFrame, Dataset] = None,
    ds_column_mapping: dict = {},
    prompt_variants: List[str] = [""],
    skip_llm_inference: bool = False,
    llm_response_cleaned_column_list: List[str] = ["reference_prediction"],
    return_scores: bool = True,
    answer_start_token: str = "",
    answer_end_token: str = "",
    scoring_params: dict = {
        "scoring_method": "exact_match",
        "case_sensitive": False,
    },
    **options,
) -> Dataset:
    """Run LLMCurate on the input data

    Args:
        column_to_curate (str): Column name in `data` with the text that needs to be curated
        data (Union[pd.DataFrame, Dataset]): Input data for LLM based curation
        ds_column_mapping (dict, optional): Mapping of entities to be used in the LLM prompt and the corresponding columns in the input data. Defaults to {}.
        prompt_variants (List[str], optional): List of different LLM prompts to be used to curate the labels under `column_to_curate`. Defaults to [''].
        skip_llm_inference (bool, optional): Indicator variable to prevent re-running LLM inference. Set to `True` if artifacts from the previous run of LLMCurate need to be reused. Else `False`. Defaults to False.
        llm_response_cleaned_column_list (list, optional): Names of the columns that will contain LLM predictions for each input prompt in `prompt_variants`. Defaults to ['reference_prediction'].
        return_scores (bool, optional): Indicator variable set to `True` if label confidence scores are to be computed for each label under `column_to_curate`. Defaults to True.
        answer_start_token (str, optional): Token that indicates the start of answer generation. Defaults to ''
        answer_end_token (str, optional): Token that indicates the end of answer generation. Defaults to ''
        scoring_params (dict, optional): Parameters related to util function `compute_selfensembling_confidence_score` to compute confidence scores of `column_to_curate`
        **options: Additional keyword arguments forwarded unchanged to `run_LLM`.

    Returns:
        Dataset: Input dataset with reference responses. If `return_scores=True`, then input dataset with reference responses and confidence scores.
    """
    # The mutable default arguments are only read here, never mutated in this
    # method, so they are kept to leave the public signature unchanged.
    if not skip_llm_inference:
        empty_string_col_list = _validate_run_params(
            data,
            column_to_curate,
            ds_column_mapping,
            prompt_variants,
            llm_response_cleaned_column_list,
        )

        if len(empty_string_col_list) > 0:
            # f-string so that the offending column names actually appear in
            # the log message.
            logger.warning(
                f"Found empty string(s) in the input data under column(s) {empty_string_col_list}"
            )

        logger.info(
            f"Running the LLM to generate the {len(prompt_variants)} reference responses using `prompt_variants`.."
        )
        ds_ensemble = None

        for index, prompt_template_prefix in enumerate(prompt_variants):
            # Each prompt variant writes its cleaned response to the column
            # name at the same position in `llm_response_cleaned_column_list`.
            proposed_answer_col_name = llm_response_cleaned_column_list[index]
            ds = run_LLM(
                data,
                self.model,
                self.tokenizer,
                ds_column_mapping=ds_column_mapping,
                prompt_template_prefix=prompt_template_prefix,
                answer_start_token=answer_start_token,
                answer_end_token=answer_end_token,
                llm_response_cleaned_col_name=proposed_answer_col_name,
                # `random_state` is not assigned in this listing; presumably
                # set by the base class initialiser - confirm.
                random_state=self.random_state,
                **options,
            )

            # Compare against None explicitly: a truthiness test would treat a
            # zero-row dataset as "not yet initialised" and drop the columns
            # collected so far.
            if ds_ensemble is None:
                ds_ensemble = ds
            else:
                ds_ensemble = ds_ensemble.add_column(
                    proposed_answer_col_name, ds[proposed_answer_col_name]
                )
        self.ds_ensemble = ds_ensemble

    if return_scores:
        # NOTE(review): with the default `prompt_variants=['']` this condition
        # is always truthy, so the warning is emitted on every run that skips
        # inference.
        if skip_llm_inference and (
            isinstance(data, pd.DataFrame)
            or ds_column_mapping
            or prompt_variants
        ):
            logger.warning(
                "Ignoring params `data`, `ds_column_mapping` and `prompt_variants` since `skip_llm_inference` is set to `True`"
            )

        # Treat "no previous run" (None) the same as an empty dataset so the
        # handler is reached instead of `len(None)` raising a TypeError.
        ds_ensemble_is_empty = (
            self.ds_ensemble is None or len(self.ds_ensemble) == 0
        )
        _empty_ds_ensemble_handler(ds_ensemble_is_empty, skip_llm_inference)

        logger.info(
            "Computing confidence scores using the LLM reference responses.."
        )
        self.ds_ensemble = self.ds_ensemble.map(
            compute_selfensembling_confidence_score,
            fn_kwargs={
                "target_column": column_to_curate,
                "reference_column_list": llm_response_cleaned_column_list,
                **scoring_params,
            },
        )

    return self.ds_ensemble