LUQ API Documentation

Methods

BaseUQModel

Source code in luq/methods/base_uq_model.py
class BaseUQModel:
    def compute_sequence_probability(
        self, logprobs: torch.Tensor, seq_prob_mode: SeqProbMode = SeqProbMode.PROD
    ) -> float:
        """
        Computes the probability of a response sequence from log-probabilities.

        Args:
            logprobs (torch.Tensor): A tensor containing log-probabilities of each token in the sequence.
            seq_prob_mode (SeqProbMode, optional): The method to compute the sequence probability.
                Options are SeqProbMode.PROD for product and SeqProbMode.AVG for average.
                Defaults to SeqProbMode.PROD.

        Returns:
            float: The computed sequence probability.

        Raises:
            ValueError: If an unknown `seq_prob_mode` is provided.
        """
        token_probs = torch.exp(logprobs)  # Convert log-probabilities to probabilities
        if seq_prob_mode == SeqProbMode.PROD:
            return torch.prod(token_probs).item()
        elif seq_prob_mode == SeqProbMode.AVG:
            return torch.mean(token_probs).item()
        else:
            raise ValueError(f"Unknown seq_prob_mode: {seq_prob_mode}")

    def normalize_sequence_probs(
        self, probs: List[float], tolerance: float = 1e-9
    ) -> List[float]:
        """
        Normalizes a list of sequence probabilities so they sum to 1.

        Args:
            probs (List[float]): A list of raw sequence probabilities.
            tolerance (float, optional): A small threshold below which the sum is considered zero
                to avoid division by zero. Defaults to 1e-9.

        Returns:
            List[float]: A list of normalized probabilities summing to 1.
        """
        z = sum(probs)
        if abs(z) < tolerance:
            return [1.0 / len(probs)] * len(probs)
        return [p / z for p in probs]

    def estimate_uncertainty(self, prompt: str, *args, **kwargs) -> float:
        """
        Estimates the uncertainty for a given prompt.

        Args:
            prompt (str): The input prompt to estimate uncertainty for.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            float: The estimated uncertainty value.

        Raises:
            NotImplementedError: This method must be implemented in a subclass.
        """
        raise NotImplementedError("method get_uncertainty is not implemented")

compute_sequence_probability(logprobs, seq_prob_mode=SeqProbMode.PROD)

Computes the probability of a response sequence from log-probabilities.

Parameters:

    logprobs (Tensor, required): A tensor containing log-probabilities of each token in the sequence.
    seq_prob_mode (SeqProbMode, default PROD): The method to compute the sequence probability. Options are SeqProbMode.PROD for product and SeqProbMode.AVG for average.

Returns:

    float: The computed sequence probability.

Raises:

    ValueError: If an unknown seq_prob_mode is provided.

Source code in luq/methods/base_uq_model.py
def compute_sequence_probability(
    self, logprobs: torch.Tensor, seq_prob_mode: SeqProbMode = SeqProbMode.PROD
) -> float:
    """
    Computes the probability of a response sequence from log-probabilities.

    Args:
        logprobs (torch.Tensor): A tensor containing log-probabilities of each token in the sequence.
        seq_prob_mode (SeqProbMode, optional): The method to compute the sequence probability.
            Options are SeqProbMode.PROD for product and SeqProbMode.AVG for average.
            Defaults to SeqProbMode.PROD.

    Returns:
        float: The computed sequence probability.

    Raises:
        ValueError: If an unknown `seq_prob_mode` is provided.
    """
    token_probs = torch.exp(logprobs)  # Convert log-probabilities to probabilities
    if seq_prob_mode == SeqProbMode.PROD:
        return torch.prod(token_probs).item()
    elif seq_prob_mode == SeqProbMode.AVG:
        return torch.mean(token_probs).item()
    else:
        raise ValueError(f"Unknown seq_prob_mode: {seq_prob_mode}")

estimate_uncertainty(prompt, *args, **kwargs)

Estimates the uncertainty for a given prompt.

Parameters:

    prompt (str, required): The input prompt to estimate uncertainty for.
    *args: Additional positional arguments.
    **kwargs: Additional keyword arguments.

Returns:

    float: The estimated uncertainty value.

Raises:

    NotImplementedError: This method must be implemented in a subclass.

Source code in luq/methods/base_uq_model.py
def estimate_uncertainty(self, prompt: str, *args, **kwargs) -> float:
    """
    Estimates the uncertainty for a given prompt.

    Args:
        prompt (str): The input prompt to estimate uncertainty for.
        *args: Additional positional arguments.
        **kwargs: Additional keyword arguments.

    Returns:
        float: The estimated uncertainty value.

    Raises:
        NotImplementedError: This method must be implemented in a subclass.
    """
    raise NotImplementedError("method get_uncertainty is not implemented")

normalize_sequence_probs(probs, tolerance=1e-09)

Normalizes a list of sequence probabilities so they sum to 1.

Parameters:

    probs (List[float], required): A list of raw sequence probabilities.
    tolerance (float, default 1e-09): A small threshold below which the sum is considered zero to avoid division by zero.

Returns:

    List[float]: A list of normalized probabilities summing to 1.

Source code in luq/methods/base_uq_model.py
def normalize_sequence_probs(
    self, probs: List[float], tolerance: float = 1e-9
) -> List[float]:
    """
    Normalizes a list of sequence probabilities so they sum to 1.

    Args:
        probs (List[float]): A list of raw sequence probabilities.
        tolerance (float, optional): A small threshold below which the sum is considered zero
            to avoid division by zero. Defaults to 1e-9.

    Returns:
        List[float]: A list of normalized probabilities summing to 1.
    """
    z = sum(probs)
    if abs(z) < tolerance:
        return [1.0 / len(probs)] * len(probs)
    return [p / z for p in probs]
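
A minimal usage sketch of the two helpers above. The import paths are inferred from the "Source code in ..." paths shown on this page and may differ in your installation; the token probabilities are made up for illustration.

import torch
from luq.methods.base_uq_model import BaseUQModel
from luq.utils.utils import SeqProbMode

model = BaseUQModel()

# Three token-level log-probability sequences of different lengths.
logprobs = [
    torch.log(torch.tensor([0.9, 0.8, 0.95])),
    torch.log(torch.tensor([0.6, 0.5])),
    torch.log(torch.tensor([0.3, 0.4, 0.2, 0.5])),
]

# PROD multiplies per-token probabilities; AVG takes their mean.
probs = [model.compute_sequence_probability(lp, SeqProbMode.PROD) for lp in logprobs]
normalized = model.normalize_sequence_probs(probs)
print(sum(normalized))  # 1.0 (up to floating-point error)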

KernelLanguageEntropyEstimator

Bases: BaseUQModel

Source code in luq/methods/kernel_language_entropy.py
class KernelLanguageEntropyEstimator(BaseUQModel):
    def __init__(self):
        """Initializes the KernelLanguageEntropyEstimator."""
        super().__init__()

    def compute_entropy(
        self,
        kernel: torch.Tensor,
        normalize: bool = False,
    ) -> float:
        """Computes the von Neumann entropy of a given unit-trace kernel matrix (semantic kernel matrix).

        Args:
            kernel (torch.Tensor): The kernel matrix.
            normalize (bool, optional): If True, normalize the kernel before computing entropy. Defaults to False.

        Returns:
            float: The computed Kernel Language Entropy.
        """
        if normalize:
            kernel = normalize_kernel(kernel)
        return von_neumann_entropy(kernel)

    def get_kernel(
        self,
        samples: LLMSamples,
        kernel_type: KernelType | None = None,
        construct_kernel: T.Callable | None = None,
        nli_model: NLIWrapper | None = None,
        nli_table: NLITable | None = None,
    ) -> torch.Tensor:
        """Constructs a kernel matrix from language model samples.

        Either `kernel_type` or `construct_kernel` must be provided, but not both.

        Args:
            samples (LLMSamples): The language model samples.
            kernel_type (KernelType | None, optional): The predefined kernel type to use. Defaults to None.
            construct_kernel (Callable | None, optional): A custom kernel construction function. Defaults to None.
            nli_model (NLIWrapper | None, optional): A model for natural language inference. Defaults to None.
            nli_table (NLITable | None, optional): A precomputed NLI similarity table. Defaults to None.

        Returns:
            torch.Tensor: The normalized kernel matrix.

        Raises:
            ValueError: If both or neither `kernel_type` and `construct_kernel` are provided.
            ValueError: If an unknown kernel type is specified.
        """
        if kernel_type is not None and construct_kernel is not None:
            raise ValueError(
                "Only one of `kernel_type` and `construct_kernel` should be specified"
            )
        if kernel_type is None and construct_kernel is None:
            raise ValueError(
                "Either `kernel_type` or `construct_kernel` should be specified"
            )

        if kernel_type is not None:
            kernel = None
            if kernel_type == KernelType.HEAT:
                # todo: calculate heat kernel
                pass
            elif kernel_type == KernelType.MATERN:
                # todo: calculate Matern kernel
                pass
            else:
                raise ValueError(f"Unknown kernel type: {kernel_type}")
        else:
            kernel = construct_kernel(samples)
        kernel = normalize_kernel(kernel)
        return kernel

    def estimate_uncertainty(
        self,
        samples: LLMSamples,
        seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
        kernel_type: KernelType = KernelType.HEAT,
        nli_model: NLIWrapper | None = None,
        nli_table: NLITable | None = None,
        construct_kernel: T.Callable | None = None,
        **kwargs,
    ) -> float:
        """Estimates uncertainty by computing the von Neumann entropy of a semantic similarity kernel.

        One of `nli_model` or `nli_table` must be provided to compute the semantic similarity.

        Args:
            samples (LLMSamples): The language model samples to analyze.
            seq_prob_mode (SeqProbMode, optional): Mode for sequence probability aggregation. Defaults to SeqProbMode.PROD.
            kernel_type (KernelType, optional): The predefined kernel type to use if `construct_kernel` is not provided. Defaults to KernelType.HEAT.
            nli_model (NLIWrapper | None, optional): A model for natural language inference. Defaults to None.
            nli_table (NLITable | None, optional): A precomputed NLI similarity table. Defaults to None.
            construct_kernel (Callable | None, optional): A custom kernel construction function. Defaults to None.
            **kwargs: Additional keyword arguments.

        Returns:
            float: The estimated uncertainty value.

        Raises:
            ValueError: If neither or both `nli_model` and `nli_table` are provided.
        """
        # validation
        if nli_model is None and nli_table is None:
            raise ValueError("Either `nli_model` or `nli_table` should be provided")

        if nli_model is not None and nli_table is not None:
            raise ValueError(
                "Only one of `nli_model` and `nli_table` should be provided"
            )

        kernel = self.get_kernel(
            samples,
            kernel_type=kernel_type,
            construct_kernel=construct_kernel,
            nli_model=nli_model,
            nli_table=nli_table,
        )
        # Compute entropy over clusters
        return self.compute_entropy(kernel)

__init__()

Initializes the KernelLanguageEntropyEstimator.

Source code in luq/methods/kernel_language_entropy.py
def __init__(self):
    """Initializes the KernelLanguageEntropyEstimator."""
    super().__init__()

compute_entropy(kernel, normalize=False)

Computes the von Neumann entropy of a given unit-trace kernel matrix (semantic kernel matrix).

Parameters:

    kernel (Tensor, required): The kernel matrix.
    normalize (bool, default False): If True, normalize the kernel before computing entropy.

Returns:

    float: The computed Kernel Language Entropy.

Source code in luq/methods/kernel_language_entropy.py
def compute_entropy(
    self,
    kernel: torch.Tensor,
    normalize: bool = False,
) -> float:
    """Computes the von Neumann entropy of a given unit-trace kernel matrix (semantic kernel matrix).

    Args:
        kernel (torch.Tensor): The kernel matrix.
        normalize (bool, optional): If True, normalize the kernel before computing entropy. Defaults to False.

    Returns:
        float: The computed Kernel Language Entropy.
    """
    if normalize:
        kernel = normalize_kernel(kernel)
    return von_neumann_entropy(kernel)

estimate_uncertainty(samples, seq_prob_mode=SeqProbMode.PROD, kernel_type=KernelType.HEAT, nli_model=None, nli_table=None, construct_kernel=None, **kwargs)

Estimates uncertainty by computing the von Neumann entropy of a semantic similarity kernel.

One of nli_model or nli_table must be provided to compute the semantic similarity.

Parameters:

    samples (LLMSamples, required): The language model samples to analyze.
    seq_prob_mode (SeqProbMode, default PROD): Mode for sequence probability aggregation.
    kernel_type (KernelType, default HEAT): The predefined kernel type to use if construct_kernel is not provided.
    nli_model (NLIWrapper | None, default None): A model for natural language inference.
    nli_table (NLITable | None, default None): A precomputed NLI similarity table.
    construct_kernel (Callable | None, default None): A custom kernel construction function.
    **kwargs: Additional keyword arguments.

Returns:

    float: The estimated uncertainty value.

Raises:

    ValueError: If neither or both nli_model and nli_table are provided.

Source code in luq/methods/kernel_language_entropy.py
def estimate_uncertainty(
    self,
    samples: LLMSamples,
    seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
    kernel_type: KernelType = KernelType.HEAT,
    nli_model: NLIWrapper | None = None,
    nli_table: NLITable | None = None,
    construct_kernel: T.Callable | None = None,
    **kwargs,
) -> float:
    """Estimates uncertainty by computing the von Neumann entropy of a semantic similarity kernel.

    One of `nli_model` or `nli_table` must be provided to compute the semantic similarity.

    Args:
        samples (LLMSamples): The language model samples to analyze.
        seq_prob_mode (SeqProbMode, optional): Mode for sequence probability aggregation. Defaults to SeqProbMode.PROD.
        kernel_type (KernelType, optional): The predefined kernel type to use if `construct_kernel` is not provided. Defaults to KernelType.HEAT.
        nli_model (NLIWrapper | None, optional): A model for natural language inference. Defaults to None.
        nli_table (NLITable | None, optional): A precomputed NLI similarity table. Defaults to None.
        construct_kernel (Callable | None, optional): A custom kernel construction function. Defaults to None.
        **kwargs: Additional keyword arguments.

    Returns:
        float: The estimated uncertainty value.

    Raises:
        ValueError: If neither or both `nli_model` and `nli_table` are provided.
    """
    # validation
    if nli_model is None and nli_table is None:
        raise ValueError("Either `nli_model` or `nli_table` should be provided")

    if nli_model is not None and nli_table is not None:
        raise ValueError(
            "Only one of `nli_model` and `nli_table` should be provided"
        )

    kernel = self.get_kernel(
        samples,
        kernel_type=kernel_type,
        construct_kernel=construct_kernel,
        nli_model=nli_model,
        nli_table=nli_table,
    )
    # Compute entropy over clusters
    return self.compute_entropy(kernel)

get_kernel(samples, kernel_type=None, construct_kernel=None, nli_model=None, nli_table=None)

Constructs a kernel matrix from language model samples.

Either kernel_type or construct_kernel must be provided, but not both.

Parameters:

    samples (LLMSamples, required): The language model samples.
    kernel_type (KernelType | None, default None): The predefined kernel type to use.
    construct_kernel (Callable | None, default None): A custom kernel construction function.
    nli_model (NLIWrapper | None, default None): A model for natural language inference.
    nli_table (NLITable | None, default None): A precomputed NLI similarity table.

Returns:

    torch.Tensor: The normalized kernel matrix.

Raises:

    ValueError: If both or neither kernel_type and construct_kernel are provided.
    ValueError: If an unknown kernel type is specified.

Source code in luq/methods/kernel_language_entropy.py
def get_kernel(
    self,
    samples: LLMSamples,
    kernel_type: KernelType | None = None,
    construct_kernel: T.Callable | None = None,
    nli_model: NLIWrapper | None = None,
    nli_table: NLITable | None = None,
) -> torch.Tensor:
    """Constructs a kernel matrix from language model samples.

    Either `kernel_type` or `construct_kernel` must be provided, but not both.

    Args:
        samples (LLMSamples): The language model samples.
        kernel_type (KernelType | None, optional): The predefined kernel type to use. Defaults to None.
        construct_kernel (Callable | None, optional): A custom kernel construction function. Defaults to None.
        nli_model (NLIWrapper | None, optional): A model for natural language inference. Defaults to None.
        nli_table (NLITable | None, optional): A precomputed NLI similarity table. Defaults to None.

    Returns:
        torch.Tensor: The normalized kernel matrix.

    Raises:
        ValueError: If both or neither `kernel_type` and `construct_kernel` are provided.
        ValueError: If an unknown kernel type is specified.
    """
    if kernel_type is not None and construct_kernel is not None:
        raise ValueError(
            "Only one of `kernel_type` and `construct_kernel` should be specified"
        )
    if kernel_type is None and construct_kernel is None:
        raise ValueError(
            "Either `kernel_type` or `construct_kernel` should be specified"
        )

    if kernel_type is not None:
        kernel = None
        if kernel_type == KernelType.HEAT:
            # todo: calculate heat kernel
            pass
        elif kernel_type == KernelType.MATERN:
            # todo: calculate Matern kernel
            pass
        else:
            raise ValueError(f"Unknown kernel type: {kernel_type}")
    else:
        kernel = construct_kernel(samples)
    kernel = normalize_kernel(kernel)
    return kernel
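
The built-in HEAT and MATERN branches are still marked as todo in the source above, so the sketch below builds the kernel with a custom construct_kernel callable instead. The "identity" kernel here is purely illustrative and not part of the library; `samples` is an LLMSamples object (documented further down this page).

import torch
from luq.methods.kernel_language_entropy import KernelLanguageEntropyEstimator

def identity_kernel(samples):
    n = len(samples)
    # Treat every answer as its own semantic cluster; trace is already 1.
    return torch.eye(n) / n

estimator = KernelLanguageEntropyEstimator()
kernel = estimator.get_kernel(samples, construct_kernel=identity_kernel)
uncertainty = estimator.compute_entropy(kernel)  # ln(number of samples) for this kernel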

KernelType

Bases: Enum

Enumeration of supported kernel types.

Attributes:

    HEAT (str): Heat kernel type.
    MATERN (str): Matern kernel type.

Source code in luq/methods/kernel_utils.py
class KernelType(Enum):
    """Enumeration of supported kernel types.

    Attributes:
        HEAT (str): Heat kernel type.
        MATERN (str): Matern kernel type.
    """
    HEAT: str = "heat"
    MATERN: str = "matern"

LLMOutput dataclass

Represents the output of a language model.

Attributes:

    answer (str): The generated text answer from the language model.
    logprobs (Tensor | None): Optional tensor containing the log probabilities associated with the generated tokens.

Source code in luq/models/llm.py
@dataclass
class LLMOutput:
    """
    Represents the output of a language model.

    Attributes:
        answer (str): The generated text answer from the language model.
        logprobs (torch.Tensor | None): Optional tensor containing the log probabilities
            associated with the generated tokens.
    """
    answer: str
    logprobs: torch.Tensor | None = None  # list of logprobs

LLMSamples dataclass

Contains multiple samples generated by a language model along with metadata.

Attributes:

    samples (List[LLMOutput]): A list of multiple LLMOutput samples.
    answer (LLMOutput): The selected or final answer output.
    params (Dict[str, Any]): Parameters used to generate the samples.

Source code in luq/models/llm.py
@dataclass
class LLMSamples:
    """
    Contains multiple samples generated by a language model along with metadata.

    Attributes:
        samples (List[LLMOutput]): A list of multiple LLMOutput samples.
        answer (LLMOutput): The selected or final answer output.
        params (Dict[str, Any]): Parameters used to generate the samples.
    """
    samples: T.List[LLMOutput]
    answer: LLMOutput
    params: T.Dict[str, T.Any]

    def __len__(self) -> int:
        """
        Returns the number of samples generated.

        Returns:
            int: The count of samples.
        """
        return len(self.samples)

__len__()

Returns the number of samples generated.

Returns:

    int: The count of samples.

Source code in luq/models/llm.py
def __len__(self) -> int:
    """
    Returns the number of samples generated.

    Returns:
        int: The count of samples.
    """
    return len(self.samples)
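
A small hand-built LLMSamples instance is often enough to exercise the estimators without calling a real model. The field values below are invented for illustration; the import path follows the source path shown above.

import torch
from luq.models.llm import LLMOutput, LLMSamples

samples = LLMSamples(
    samples=[
        LLMOutput(answer="Paris", logprobs=torch.log(torch.tensor([0.9, 0.8]))),
        LLMOutput(answer="It is Paris", logprobs=torch.log(torch.tensor([0.7, 0.6, 0.8]))),
        LLMOutput(answer="Lyon", logprobs=torch.log(torch.tensor([0.2, 0.1]))),
    ],
    answer=LLMOutput(answer="Paris"),
    params={"temperature": 1.0},
)
print(len(samples))  # 3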

LLMWrapper

Source code in luq/models/llm.py
class LLMWrapper:
    def __call__(self, *args, **kwargs) -> LLMOutput:
        """
        Abstract base wrapper for language model interfaces.

        This class is meant to be subclassed to implement specific LLM calls.
        """
        raise NotImplementedError("__call__ should be implemented for your LLM")

__call__(*args, **kwargs)

Abstract base wrapper for language model interfaces.

This class is meant to be subclassed to implement specific LLM calls.

Source code in luq/models/llm.py
def __call__(self, *args, **kwargs) -> LLMOutput:
    """
    Abstract base wrapper for language model interfaces.

    This class is meant to be subclassed to implement specific LLM calls.
    """
    raise NotImplementedError("__call__ should be implemented for your LLM")

MaxProbabilityEstimator

Bases: BaseUQModel

Uncertainty estimator that uses the probability of the most likely sequence.

This class estimates uncertainty by computing the probability of each sequence in a set of samples, and returning one minus the maximum probability, which serves as a measure of uncertainty.

Source code in luq/methods/max_probability.py
class MaxProbabilityEstimator(BaseUQModel):
    """Uncertainty estimator that uses the probability of the most likely sequence.

    This class estimates uncertainty by computing the probability of each sequence in a set of samples,
    and returning one minus the maximum probability, which serves as a measure of uncertainty.
    """
    def estimate_uncertainty(
        self,
        samples: T.List[LLMOutput],
        seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
        **kwargs,
    ) -> float:
        """Estimate uncertainty from a list of LLM output samples.

        This method calculates the sequence probability for each sample using the specified
        sequence probability mode and returns an uncertainty score equal to `1 - max(sequence_probs)`.

        Args:
            samples (List[LLMOutput]): A list of language model outputs with associated log probabilities.
            seq_prob_mode (SeqProbMode, optional): Mode for aggregating token probabilities into
                sequence probabilities (e.g., product or average). Defaults to `SeqProbMode.PROD`.
            **kwargs: Additional keyword arguments (unused here but kept for compatibility).

        Returns:
            float: Uncertainty score, where higher values indicate more uncertainty.
        """
        assert all(s.logprobs is not None for s in samples.samples)

        logit_samples = [s.logprobs for s in samples.samples]
        sequence_probs = [
            self.compute_sequence_probability(logits, seq_prob_mode)
            for logits in logit_samples
        ]
        sequence_probs = self.normalize_sequence_probs(sequence_probs)
        return 1 - max(sequence_probs)

estimate_uncertainty(samples, seq_prob_mode=SeqProbMode.PROD, **kwargs)

Estimate uncertainty from a list of LLM output samples.

This method calculates the sequence probability for each sample using the specified sequence probability mode and returns an uncertainty score equal to 1 - max(sequence_probs).

Parameters:

    samples (List[LLMOutput], required): A list of language model outputs with associated log probabilities.
    seq_prob_mode (SeqProbMode, default PROD): Mode for aggregating token probabilities into sequence probabilities (e.g., product or average).
    **kwargs: Additional keyword arguments (unused here but kept for compatibility).

Returns:

    float: Uncertainty score, where higher values indicate more uncertainty.

Source code in luq/methods/max_probability.py
def estimate_uncertainty(
    self,
    samples: T.List[LLMOutput],
    seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
    **kwargs,
) -> float:
    """Estimate uncertainty from a list of LLM output samples.

    This method calculates the sequence probability for each sample using the specified
    sequence probability mode and returns an uncertainty score equal to `1 - max(sequence_probs)`.

    Args:
        samples (List[LLMOutput]): A list of language model outputs with associated log probabilities.
        seq_prob_mode (SeqProbMode, optional): Mode for aggregating token probabilities into
            sequence probabilities (e.g., product or average). Defaults to `SeqProbMode.PROD`.
        **kwargs: Additional keyword arguments (unused here but kept for compatibility).

    Returns:
        float: Uncertainty score, where higher values indicate more uncertainty.
    """
    assert all(s.logprobs is not None for s in samples.samples)

    logit_samples = [s.logprobs for s in samples.samples]
    sequence_probs = [
        self.compute_sequence_probability(logits, seq_prob_mode)
        for logits in logit_samples
    ]
    sequence_probs = self.normalize_sequence_probs(sequence_probs)
    return 1 - max(sequence_probs)
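
Usage sketch, assuming `samples` is an LLMSamples instance with logprobs populated (such as the hand-built one shown earlier on this page); the import path mirrors the source path above.

from luq.methods.max_probability import MaxProbabilityEstimator

estimator = MaxProbabilityEstimator()
score = estimator.estimate_uncertainty(samples)
# score = 1 - max(normalized sequence probabilities); closer to 1 means the best
# sample is less dominant, i.e. higher uncertainty.
print(score)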

NLIWrapper

Abstract wrapper class for Natural Language Inference (NLI) models.

Source code in luq/models/nli.py
class NLIWrapper:
    """
    Abstract wrapper class for Natural Language Inference (NLI) models.
    """
    def __call__(self, *args, **kwargs) -> T.List[NLIOutput]:
        """
        Runs the NLI model on input arguments.

        Returns:
            List[NLIOutput]: A list of NLI model outputs.

        Raises:
            NotImplementedError: If not implemented in a subclass.
        """
        raise NotImplementedError("NLI model should implement `__call__` method.")

__call__(*args, **kwargs)

Runs the NLI model on input arguments.

Returns:

    List[NLIOutput]: A list of NLI model outputs.

Raises:

    NotImplementedError: If not implemented in a subclass.

Source code in luq/models/nli.py
def __call__(self, *args, **kwargs) -> T.List[NLIOutput]:
    """
    Runs the NLI model on input arguments.

    Returns:
        List[NLIOutput]: A list of NLI model outputs.

    Raises:
        NotImplementedError: If not implemented in a subclass.
    """
    raise NotImplementedError("NLI model should implement `__call__` method.")

PredictiveEntropyEstimator

Bases: BaseUQModel

Source code in luq/methods/predictive_entropy.py
class PredictiveEntropyEstimator(BaseUQModel):
    def generate_logits(self, prompt: str, num_samples: int = 10) -> T.List:
        """Generates multiple responses from the language model and extracts their logits.

        Args:
            prompt (str): The input prompt for the language model.
            num_samples (int, optional): Number of samples to generate. Defaults to 10.

        Returns:
            List: A list of logit sequences corresponding to the generated samples.

        Raises:
            ValueError: If the internal language model is not an instance of LLMWrapper.
        """
        logit_samples = []

        for _ in range(num_samples):
            if isinstance(self._llm, LLMWrapper):
                response = self._llm(prompt)
            else:
                raise ValueError(
                    f"Cannot compute logits LogitUncertaintyQuantification for {type(self._llm)}"
                )
            logit_samples.append(response.logits)

        return logit_samples

    def compute_entropy(self, sequence_probs: torch.Tensor | List) -> float:
        """Computes the entropy over a list of sequence probabilities.

        Args:
            sequence_probs (list or torch.Tensor): List or tensor of sequence probabilities.

        Returns:
            float: The entropy value computed from the normalized probability distribution.
        """
        if not isinstance(sequence_probs, torch.Tensor):
            sequence_probs = torch.tensor(sequence_probs)

        sequence_probs /= sum(
            sequence_probs
        )  # Normalize to form a probability distribution
        return entropy(sequence_probs)

    def estimate_uncertainty(
        self,
        samples: T.List[LLMOutput],
        seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
        **kwargs,
    ) -> float:
        """
        Uncertainty is estimated by computing the entropy of probabilities obtained from sampled sequences.

        :param samples: A list of LLM outputs containing log-probabilities.
        :param seq_prob_mode: Describes how token probabilities are translated into sequence probabilities.
        :return: entropy score
        """
        assert all(s.logprobs is not None for s in samples.samples)

        logit_samples = [s.logprobs for s in samples.samples]
        sequence_probs = [
            self.compute_sequence_probability(logits, seq_prob_mode)
            for logits in logit_samples
        ]
        entropy_value = self.compute_entropy(sequence_probs)

        return entropy_value

compute_entropy(sequence_probs)

Computes the entropy over a list of sequence probabilities.

Parameters:

    sequence_probs (list or Tensor, required): List or tensor of sequence probabilities.

Returns:

    float: The entropy value computed from the normalized probability distribution.

Source code in luq/methods/predictive_entropy.py
def compute_entropy(self, sequence_probs: torch.Tensor | List) -> float:
    """Computes the entropy over a list of sequence probabilities.

    Args:
        sequence_probs (list or torch.Tensor): List or tensor of sequence probabilities.

    Returns:
        float: The entropy value computed from the normalized probability distribution.
    """
    if not isinstance(sequence_probs, torch.Tensor):
        sequence_probs = torch.tensor(sequence_probs)

    sequence_probs /= sum(
        sequence_probs
    )  # Normalize to form a probability distribution
    return entropy(sequence_probs)

estimate_uncertainty(samples, seq_prob_mode=SeqProbMode.PROD, **kwargs)

Uncertainty is estimated by computing the entropy of probabilities obtained from sampled sequences.

Parameters:

    samples (List[LLMOutput], required): A list of LLM outputs containing log-probabilities.
    seq_prob_mode (SeqProbMode, default PROD): Describes how token probabilities are translated into sequence probabilities.

Returns:

    float: The entropy score.

Source code in luq/methods/predictive_entropy.py
def estimate_uncertainty(
    self,
    samples: T.List[LLMOutput],
    seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
    **kwargs,
) -> float:
    """
    Uncertainty is estimated by computing the entropy of probabilities obtained from sampled sequences.

    :param samples: A list of LLM outputs containing log-probabilities.
    :param seq_prob_mode: Describes how token probabilities are translated into sequence probabilities.
    :return: entropy score
    """
    assert all(s.logprobs is not None for s in samples.samples)

    logit_samples = [s.logprobs for s in samples.samples]
    sequence_probs = [
        self.compute_sequence_probability(logits, seq_prob_mode)
        for logits in logit_samples
    ]
    entropy_value = self.compute_entropy(sequence_probs)

    return entropy_value

generate_logits(prompt, num_samples=10)

Generates multiple responses from the language model and extracts their logits.

Parameters:

    prompt (str, required): The input prompt for the language model.
    num_samples (int, default 10): Number of samples to generate.

Returns:

    List: A list of logit sequences corresponding to the generated samples.

Raises:

    ValueError: If the internal language model is not an instance of LLMWrapper.

Source code in luq/methods/predictive_entropy.py
def generate_logits(self, prompt: str, num_samples: int = 10) -> T.List:
    """Generates multiple responses from the language model and extracts their logits.

    Args:
        prompt (str): The input prompt for the language model.
        num_samples (int, optional): Number of samples to generate. Defaults to 10.

    Returns:
        List: A list of logit sequences corresponding to the generated samples.

    Raises:
        ValueError: If the internal language model is not an instance of LLMWrapper.
    """
    logit_samples = []

    for _ in range(num_samples):
        if isinstance(self._llm, LLMWrapper):
            response = self._llm(prompt)
        else:
            raise ValueError(
                f"Cannot compute logits LogitUncertaintyQuantification for {type(self._llm)}"
            )
        logit_samples.append(response.logits)

    return logit_samples
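
Usage sketch for the estimator, again assuming a populated LLMSamples instance named `samples`; AVG is shown because the product mode shrinks quickly with sequence length. Import paths are assumed from the source paths on this page.

from luq.methods.predictive_entropy import PredictiveEntropyEstimator
from luq.utils.utils import SeqProbMode

estimator = PredictiveEntropyEstimator()
score = estimator.estimate_uncertainty(samples, seq_prob_mode=SeqProbMode.AVG)
print(score)  # entropy of the normalized sequence probabilities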

SemanticEntropyEstimator

Bases: BaseUQModel

Source code in luq/methods/semantic_entropy.py
class SemanticEntropyEstimator(BaseUQModel):
    def __init__(self):
        """Initializes the SemanticEntropyEstimator."""
        super().__init__()

    def compute_entropy(
        self, cluster_assignments: T.List[int], sequence_probs: T.List[float] | None
    ) -> float:
        """Computes entropy over semantic clusters.

        Entropy is calculated either using:
        - Cluster sizes (discrete entropy), or
        - Weighted sequence probabilities assigned to clusters (continuous entropy).

        Args:
            cluster_assignments (List[int]): List mapping each response to a cluster ID.
            sequence_probs (List[float] | None): List of sequence probabilities. If None,
                discrete entropy is computed based on cluster sizes.

        Returns:
            float: Entropy value representing semantic uncertainty.
        """
        if sequence_probs is None:
            # Discrete Semantic Entropy
            cluster_counts = Counter(cluster_assignments)
            cluster_probs = torch.tensor(
                [
                    count / sum(cluster_counts.values())
                    for count in cluster_counts.values()
                ]
            )
        else:
            # Continuous Semantic Entropy with sequence probabilities
            cluster_probs = torch.zeros(max(cluster_assignments) + 1)
            for cluster_id, prob in zip(cluster_assignments, sequence_probs):
                cluster_probs[cluster_id] += prob
            # Normalize probabilities
            cluster_probs = cluster_probs / torch.sum(cluster_probs)

        return entropy(cluster_probs)

    def estimate_uncertainty(
        self,
        samples: LLMSamples,
        seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
        nli_model: NLIWrapper | None = None,
        nli_table: NLITable | None = None,
        **kwargs,
    ) -> float:
        """Estimates uncertainty based on the semantic diversity of LLM responses.

        Semantic uncertainty is computed by clustering responses into meaning-based groups
        using an NLI model or precomputed NLI table, and then calculating entropy across
        these clusters.

        Args:
            samples (LLMSamples): List of LLM responses containing text and log-probabilities.
            seq_prob_mode (SeqProbMode, optional): Defines how to compute sequence probabilities
                from token log-probabilities. Defaults to SeqProbMode.PROD.
            nli_model (NLIWrapper | None, optional): NLI model used to compute entailment-based similarity.
            nli_table (NLITable | None, optional): Precomputed NLI similarity table to avoid recomputation.
            **kwargs: Additional arguments for future extensibility.

        Returns:
            float: Estimated entropy based on semantic clustering.

        Raises:
            ValueError: If neither or both of `nli_model` and `nli_table` are provided.
        """

        # validation
        if nli_model is None and nli_table is None:
            raise ValueError("Either `nli_model` or `nli_table` should be provided")

        if nli_model is not None and nli_table is not None:
            raise ValueError(
                "Only one of `nli_model` and `nli_table` should be provided"
            )

        logit_samples = [s.logprobs for s in samples.samples]

        # Compute sequence probabilities
        sequence_probs = [
            self.compute_sequence_probability(logits, seq_prob_mode)
            for logits in logit_samples
        ]

        if nli_table is None:
            nli_table = construct_nli_table(samples, nli_model)

        # Cluster responses
        cluster_assignments = hard_nli_clustering(samples, nli_table)

        # Compute entropy over clusters
        return self.compute_entropy(cluster_assignments, sequence_probs)

__init__()

Initializes the SemanticEntropyEstimator.

Source code in luq/methods/semantic_entropy.py
def __init__(self):
    """Initializes the SemanticEntropyEstimator."""
    super().__init__()

compute_entropy(cluster_assignments, sequence_probs)

Computes entropy over semantic clusters.

Entropy is calculated either using: - Cluster sizes (discrete entropy), or - Weighted sequence probabilities assigned to clusters (continuous entropy).

Parameters:

    cluster_assignments (List[int], required): List mapping each response to a cluster ID.
    sequence_probs (List[float] | None, required): List of sequence probabilities. If None, discrete entropy is computed based on cluster sizes.

Returns:

    float: Entropy value representing semantic uncertainty.

Source code in luq/methods/semantic_entropy.py
def compute_entropy(
    self, cluster_assignments: T.List[int], sequence_probs: T.List[float] | None
) -> float:
    """Computes entropy over semantic clusters.

    Entropy is calculated either using:
    - Cluster sizes (discrete entropy), or
    - Weighted sequence probabilities assigned to clusters (continuous entropy).

    Args:
        cluster_assignments (List[int]): List mapping each response to a cluster ID.
        sequence_probs (List[float] | None): List of sequence probabilities. If None,
            discrete entropy is computed based on cluster sizes.

    Returns:
        float: Entropy value representing semantic uncertainty.
    """
    if sequence_probs is None:
        # Discrete Semantic Entropy
        cluster_counts = Counter(cluster_assignments)
        cluster_probs = torch.tensor(
            [
                count / sum(cluster_counts.values())
                for count in cluster_counts.values()
            ]
        )
    else:
        # Continuous Semantic Entropy with sequence probabilities
        cluster_probs = torch.zeros(max(cluster_assignments) + 1)
        for cluster_id, prob in zip(cluster_assignments, sequence_probs):
            cluster_probs[cluster_id] += prob
        # Normalize probabilities
        cluster_probs = cluster_probs / torch.sum(cluster_probs)

    return entropy(cluster_probs)
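
A worked example of the two variants, with made-up cluster assignments and probabilities (natural logarithm throughout):

from luq.methods.semantic_entropy import SemanticEntropyEstimator

est = SemanticEntropyEstimator()
clusters = [0, 0, 1]  # two responses share a meaning, one differs

discrete = est.compute_entropy(clusters, None)              # cluster sizes 2/3 and 1/3
weighted = est.compute_entropy(clusters, [0.5, 0.3, 0.2])   # cluster masses 0.8 and 0.2

print(round(discrete, 3))  # ~0.637 = -(2/3 * ln(2/3) + 1/3 * ln(1/3))
print(round(weighted, 3))  # ~0.500 = -(0.8 * ln(0.8) + 0.2 * ln(0.2))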

estimate_uncertainty(samples, seq_prob_mode=SeqProbMode.PROD, nli_model=None, nli_table=None, **kwargs)

Estimates uncertainty based on the semantic diversity of LLM responses.

Semantic uncertainty is computed by clustering responses into meaning-based groups using an NLI model or precomputed NLI table, and then calculating entropy across these clusters.

Parameters:

    samples (LLMSamples, required): List of LLM responses containing text and log-probabilities.
    seq_prob_mode (SeqProbMode, default PROD): Defines how to compute sequence probabilities from token log-probabilities.
    nli_model (NLIWrapper | None, default None): NLI model used to compute entailment-based similarity.
    nli_table (NLITable | None, default None): Precomputed NLI similarity table to avoid recomputation.
    **kwargs: Additional arguments for future extensibility.

Returns:

    float: Estimated entropy based on semantic clustering.

Raises:

    ValueError: If neither or both of nli_model and nli_table are provided.

Source code in luq/methods/semantic_entropy.py
def estimate_uncertainty(
    self,
    samples: LLMSamples,
    seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
    nli_model: NLIWrapper | None = None,
    nli_table: NLITable | None = None,
    **kwargs,
) -> float:
    """Estimates uncertainty based on the semantic diversity of LLM responses.

    Semantic uncertainty is computed by clustering responses into meaning-based groups
    using an NLI model or precomputed NLI table, and then calculating entropy across
    these clusters.

    Args:
        samples (LLMSamples): List of LLM responses containing text and log-probabilities.
        seq_prob_mode (SeqProbMode, optional): Defines how to compute sequence probabilities
            from token log-probabilities. Defaults to SeqProbMode.PROD.
        nli_model (NLIWrapper | None, optional): NLI model used to compute entailment-based similarity.
        nli_table (NLITable | None, optional): Precomputed NLI similarity table to avoid recomputation.
        **kwargs: Additional arguments for future extensibility.

    Returns:
        float: Estimated entropy based on semantic clustering.

    Raises:
        ValueError: If neither or both of `nli_model` and `nli_table` are provided.
    """

    # validation
    if nli_model is None and nli_table is None:
        raise ValueError("Either `nli_model` or `nli_table` should be provided")

    if nli_model is not None and nli_table is not None:
        raise ValueError(
            "Only one of `nli_model` and `nli_table` should be provided"
        )

    logit_samples = [s.logprobs for s in samples.samples]

    # Compute sequence probabilities
    sequence_probs = [
        self.compute_sequence_probability(logits, seq_prob_mode)
        for logits in logit_samples
    ]

    if nli_table is None:
        nli_table = construct_nli_table(samples, nli_model)

    # Cluster responses
    cluster_assignments = hard_nli_clustering(samples, nli_table)

    # Compute entropy over clusters
    return self.compute_entropy(cluster_assignments, sequence_probs)

SeqProbMode

Bases: Enum

Enumeration for modes of combining token probabilities in a sequence.

Attributes:

    PROD: Use the product of probabilities.
    AVG: Use the average of probabilities.

Source code in luq/utils/utils.py
class SeqProbMode(Enum):
    """
    Enumeration for modes of combining token probabilities in a sequence.

    Attributes:
        PROD: Use the product of probabilities.
        AVG: Use the average of probabilities.
    """
    PROD = "prod"
    AVG = "avg"

TopKGapEstimator

Bases: BaseUQModel

Source code in luq/methods/top_k_gap.py
class TopKGapEstimator(BaseUQModel):
    def estimate_uncertainty(
        self,
        samples: T.List[LLMOutput],
        seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
        k: int = 2,
        **kwargs,
    ) -> float:
        """Estimates uncertainty using the gap between the top-k sequence probabilities.

        The method computes sequence-level probabilities from the sampled responses,
        identifies the `k` highest probabilities, and returns a normalized uncertainty
        score as `1 - (gap between top-1 and top-k probabilities)`.

        A smaller gap between top-k and top-1 implies higher uncertainty (less confident top choice),
        while a large gap suggests stronger model confidence.

        Args:
            samples (List[LLMOutput]): A list of LLM outputs containing log-probabilities.
            seq_prob_mode (SeqProbMode, optional): Method for combining token log-probabilities into
                a single sequence probability. Defaults to `SeqProbMode.PROD`.
            k (int, optional): The number of top probabilities to compare. Must be >= 2. Defaults to 2.
            **kwargs: Additional arguments for extensibility (unused).

        Returns:
            float: A normalized uncertainty score based on the gap between top-1 and top-k probabilities.

        Raises:
            ValueError: If `k` is less than 2.
            AssertionError: If any sample does not contain log-probabilities.
        """
        if k < 2:
            raise ValueError("k should >= 2")
        assert all(s.logprobs is not None for s in samples.samples)

        logit_samples = [s.logprobs for s in samples.samples]
        sequence_probs = [
            self.compute_sequence_probability(logits, seq_prob_mode)
            for logits in logit_samples
        ]
        sorted_seq_probs = sorted(sequence_probs)
        gap = sorted_seq_probs[-1] - sorted_seq_probs[-k]
        return 1 - gap

estimate_uncertainty(samples, seq_prob_mode=SeqProbMode.PROD, k=2, **kwargs)

Estimates uncertainty using the gap between the top-k sequence probabilities.

The method computes sequence-level probabilities from the sampled responses, identifies the k highest probabilities, and returns a normalized uncertainty score as 1 - (gap between top-1 and top-k probabilities).

A smaller gap between top-k and top-1 implies higher uncertainty (less confident top choice), while a large gap suggests stronger model confidence.

Parameters:

    samples (List[LLMOutput], required): A list of LLM outputs containing log-probabilities.
    seq_prob_mode (SeqProbMode, default PROD): Method for combining token log-probabilities into a single sequence probability.
    k (int, default 2): The number of top probabilities to compare. Must be >= 2.
    **kwargs: Additional arguments for extensibility (unused).

Returns:

    float: A normalized uncertainty score based on the gap between top-1 and top-k probabilities.

Raises:

    ValueError: If k is less than 2.
    AssertionError: If any sample does not contain log-probabilities.

Source code in luq/methods/top_k_gap.py
def estimate_uncertainty(
    self,
    samples: T.List[LLMOutput],
    seq_prob_mode: SeqProbMode = SeqProbMode.PROD,
    k: int = 2,
    **kwargs,
) -> float:
    """Estimates uncertainty using the gap between the top-k sequence probabilities.

    The method computes sequence-level probabilities from the sampled responses,
    identifies the `k` highest probabilities, and returns a normalized uncertainty
    score as `1 - (gap between top-1 and top-k probabilities)`.

    A smaller gap between top-k and top-1 implies higher uncertainty (less confident top choice),
    while a large gap suggests stronger model confidence.

    Args:
        samples (List[LLMOutput]): A list of LLM outputs containing log-probabilities.
        seq_prob_mode (SeqProbMode, optional): Method for combining token log-probabilities into
            a single sequence probability. Defaults to `SeqProbMode.PROD`.
        k (int, optional): The number of top probabilities to compare. Must be >= 2. Defaults to 2.
        **kwargs: Additional arguments for extensibility (unused).

    Returns:
        float: A normalized uncertainty score based on the gap between top-1 and top-k probabilities.

    Raises:
        ValueError: If `k` is less than 2.
        AssertionError: If any sample does not contain log-probabilities.
    """
    if k < 2:
        raise ValueError("k should >= 2")
    assert all(s.logprobs is not None for s in samples.samples)

    logit_samples = [s.logprobs for s in samples.samples]
    sequence_probs = [
        self.compute_sequence_probability(logits, seq_prob_mode)
        for logits in logit_samples
    ]
    sorted_seq_probs = sorted(sequence_probs)
    gap = sorted_seq_probs[-1] - sorted_seq_probs[-k]
    return 1 - gap
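
Usage sketch, assuming `samples` is a populated LLMSamples instance as elsewhere on this page:

from luq.methods.top_k_gap import TopKGapEstimator

estimator = TopKGapEstimator()
score = estimator.estimate_uncertainty(samples, k=2)
# score near 1: the two most probable samples are almost tied (high uncertainty)
# score near 0: one sample clearly dominates (low uncertainty)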

construct_nli_table(samples, nli_model)

Constructs a table of NLI results for all pairs of generated samples.

Parameters:

    samples (LLMSamples, required): The generated language model outputs.
    nli_model (NLIWrapper, required): An NLI model wrapper used to evaluate relationships between outputs.

Returns:

    NLITable: A dictionary mapping (answer1, answer2) pairs to NLIOutput results.

Source code in luq/models/nli.py
def construct_nli_table(samples: LLMSamples, nli_model: NLIWrapper) -> NLITable:
    """
    Constructs a table of NLI results for all pairs of generated samples.

    Args:
        samples (LLMSamples): The generated language model outputs.
        nli_model (NLIWrapper): An NLI model wrapper used to evaluate relationships between outputs.

    Returns:
        NLITable: A dictionary mapping (answer1, answer2) pairs to NLIOutput results.
    """
    result = {}
    for i, s1 in enumerate(samples.samples):
        for s2 in samples.samples:
            answer1, answer2 = s1.answer, s2.answer
            if (answer1, answer2) in result:
                continue
            nli_output: NLIOutput = nli_model(answer1, answer2, params=samples.params)
            result[(answer1, answer2)] = nli_output
    return result

entropy(probabilities)

Computes the entropy of a probability distribution.

Parameters:

    probabilities (Union[List[float], Tensor], required): A list or tensor of probabilities. The probabilities should sum to 1 and represent a valid distribution.

Returns:

    float: The computed entropy value.

Notes:

    Adds a small epsilon (1e-9) to probabilities to avoid log(0).

Source code in luq/utils/utils.py
def entropy(probabilities: Union[List[float], torch.Tensor]) -> float:
    """
    Computes the entropy of a probability distribution.

    Args:
        probabilities (Union[List[float], torch.Tensor]): A list or tensor of probabilities.
            The probabilities should sum to 1 and represent a valid distribution.

    Returns:
        float: The computed entropy as a scalar value.

    Notes:
        Adds a small epsilon (1e-9) to probabilities to avoid log(0).
    """
    probabilities = (
        torch.tensor(probabilities, dtype=torch.float32)
        if isinstance(probabilities, list)
        else probabilities
    )
    entropy_value = -torch.sum(probabilities * torch.log(probabilities + 1e-9))
    return entropy_value.item()
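
Two quick sanity checks of the helper (natural logarithm; the epsilon keeps log(0) finite):

from luq.utils.utils import entropy

print(entropy([0.5, 0.5]))  # ~0.693 = ln(2)
print(entropy([1.0, 0.0]))  # ~0.0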

hard_nli_clustering(samples, nli_table)

Performs hard clustering of samples based on mutual entailment using NLI results.

Parameters:

    samples (LLMSamples, required): The list of LLM-generated samples.
    nli_table (NLITable, required): A dictionary of NLI outputs between sample pairs.

Returns:

    List[int]: A list of cluster assignments (by index) for each sample.

Source code in luq/models/nli.py
def hard_nli_clustering(samples: LLMSamples, nli_table: NLITable) -> T.List[int]:
    """
    Performs hard clustering of samples based on mutual entailment using NLI results.

    Args:
        samples (LLMSamples): The list of LLM-generated samples.
        nli_table (NLITable): A dictionary of NLI outputs between sample pairs.

    Returns:
        List[int]: A list of cluster assignments (by index) for each sample.
    """
    clusters = [None] * len(samples.samples)
    last_cluster = 0
    for i, s1 in enumerate(samples.samples):
        if clusters[i] is None:
            clusters[i] = last_cluster
            last_cluster += 1
        for j, s2 in enumerate(samples.samples[i + 1 :], i + 1):
            if clusters[j] is not None:
                continue
            if (
                nli_table[(s1.answer, s2.answer)].cls == NLIResult.ENTAILMENT
                and nli_table[(s2.answer, s1.answer)].cls == NLIResult.ENTAILMENT
            ):
                clusters[j] = clusters[i]
    return clusters
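
A heavily simplified sketch of plugging a custom NLI wrapper into these two helpers. The NLIOutput constructor and the NLIResult members other than ENTAILMENT are assumptions here and may not match the real definitions in luq/models/nli.py; a trivial string-match "NLI model" stands in for a real one, and `samples` is a populated LLMSamples instance as elsewhere on this page.

from luq.models.nli import (
    NLIWrapper, NLIOutput, NLIResult, construct_nli_table, hard_nli_clustering,
)

class ExactMatchNLI(NLIWrapper):
    def __call__(self, premise, hypothesis, params=None):
        # Declare mutual entailment only for (case-insensitively) identical answers.
        same = premise.strip().lower() == hypothesis.strip().lower()
        label = NLIResult.ENTAILMENT if same else NLIResult.CONTRADICTION  # member name assumed
        return NLIOutput(cls=label)  # constructor signature assumed

nli_table = construct_nli_table(samples, ExactMatchNLI())
clusters = hard_nli_clustering(samples, nli_table)  # one cluster id per sample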

von_neumann_entropy(rho)

Compute the von Neumann entropy of a density matrix.

The von Neumann entropy is defined as

S(ρ) = -Tr(ρ log(ρ))

Parameters:

    rho (Tensor, required): A Hermitian, positive semi-definite matrix (density matrix).

Returns:

    torch.Tensor: Scalar tensor representing the entropy.

Source code in luq/methods/kernel_utils.py
def von_neumann_entropy(rho: torch.Tensor) -> torch.Tensor:
    """Compute the von Neumann entropy of a density matrix.

    The von Neumann entropy is defined as:
        S(ρ) = -Tr(ρ log(ρ))

    Args:
        rho (torch.Tensor): A Hermitian, positive semi-definite matrix (density matrix).

    Returns:
        torch.Tensor: Scalar tensor representing the entropy.
    """
    # Compute eigenvalues (ensuring they are real since rho should be Hermitian)
    eigenvalues = torch.linalg.eigvalsh(rho)

    # Avoid log(0) by masking zero values
    nonzero_eigenvalues = eigenvalues[eigenvalues > 0]

    # Compute entropy
    entropy = -torch.sum(nonzero_eigenvalues * torch.log(nonzero_eigenvalues))

    return entropy
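
Sanity check against two textbook density matrices (the maximally mixed 2x2 state has entropy ln 2):

import torch
from luq.methods.kernel_utils import von_neumann_entropy

rho_pure = torch.tensor([[1.0, 0.0], [0.0, 0.0]])  # pure state
rho_mixed = torch.eye(2) / 2                        # maximally mixed state

print(von_neumann_entropy(rho_pure))   # ~0.0
print(von_neumann_entropy(rho_mixed))  # ~0.6931 = ln(2)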

Models

AzureCustomGPT4Wrapper

Wrapper for Azure-hosted GPT-4 model using OpenAI-compatible API.

Source code in luq/models/llm.py
class AzureCustomGPT4Wrapper:
    """
    Wrapper for Azure-hosted GPT-4 model using OpenAI-compatible API.
    """
    def __init__(self, openai_endpoint_url, api_key):
        """
        Initializes the Azure GPT-4 wrapper.

        Args:
            openai_endpoint_url (str): The base URL of the Azure OpenAI endpoint.
            api_key (str): Azure API key for authentication.
        """
        self.openai_endpoint_url = openai_endpoint_url
        self.client = OpenAI(
            base_url=openai_endpoint_url,
            api_key=False,
            default_headers={
                "Ocp-Apim-Subscription-Key": api_key,
            },
            http_client=httpx.Client(
                event_hooks={"request": [functools.partial(update_base_url, openai_endpoint_url=openai_endpoint_url)]}
            ),
        )

    def __call__(self, input: str) -> LLMOutput:
        """
        Generates a response using the Azure-hosted GPT-4 model.

        Args:
            input (str): User input prompt.

        Returns:
            LLMOutput: Generated answer and optional log probabilities.
        """
        kwargs = {
            "model": "no_effect",  # Replace with your actual deployment name
            "logprobs": True,
            "messages": [{"role": "user", "content": input}],
            "top_logprobs": 5,
        }
        response = self.client.chat.completions.create(**kwargs)
        content = response.choices[0].message.content
        token_logprobs = response.choices[0].logprobs.token_logprobs
        if token_logprobs is not None:
            logprobs_tensor = torch.tensor(token_logprobs, dtype=torch.float32)
        else:
            logprobs_tensor = None
        return LLMOutput(answer=content, logprobs=logprobs_tensor)

__call__(input)

Generates a response using the Azure-hosted GPT-4 model.

Parameters:

    input (str, required): User input prompt.

Returns:

    LLMOutput: Generated answer and optional log probabilities.

Source code in luq/models/llm.py
def __call__(self, input: str) -> LLMOutput:
    """
    Generates a response using the Azure-hosted GPT-4 model.

    Args:
        input (str): User input prompt.

    Returns:
        LLMOutput: Generated answer and optional log probabilities.
    """
    kwargs = {
        "model": "no_effect",  # Replace with your actual deployment name
        "logprobs": True,
        "messages": [{"role": "user", "content": input}],
        "top_logprobs": 5,
    }
    response = self.client.chat.completions.create(**kwargs)
    content = response.choices[0].message.content
    token_logprobs = response.choices[0].logprobs.token_logprobs
    if token_logprobs is not None:
        logprobs_tensor = torch.tensor(token_logprobs, dtype=torch.float32)
    else:
        logprobs_tensor = None
    return LLMOutput(answer=content, logprobs=logprobs_tensor)

__init__(openai_endpoint_url, api_key)

Initializes the Azure GPT-4 wrapper.

Parameters:

    openai_endpoint_url (str, required): The base URL of the Azure OpenAI endpoint.
    api_key (str, required): Azure API key for authentication.

Source code in luq/models/llm.py
def __init__(self, openai_endpoint_url, api_key):
    """
    Initializes the Azure GPT-4 wrapper.

    Args:
        openai_endpoint_url (str): The base URL of the Azure OpenAI endpoint.
        api_key (str): Azure API key for authentication.
    """
    self.openai_endpoint_url = openai_endpoint_url
    self.client = OpenAI(
        base_url=openai_endpoint_url,
        api_key=False,  # placeholder value: authentication is done via the subscription-key header below
        default_headers={
            "Ocp-Apim-Subscription-Key": api_key,
        },
        http_client=httpx.Client(
            event_hooks={"request": [functools.partial(update_base_url, openai_endpoint_url=openai_endpoint_url)]}
        ),
    )

BatchLLMWrapper

Abstract class for batch LLM interfaces that return multiple LLM outputs.

Source code in luq/models/llm.py
class BatchLLMWrapper:
    """
    Abstract class for batch LLM interfaces that return multiple LLM outputs.
    """
    def __call__(self, *args, **kwargs) -> T.List[LLMOutput]:
        """
        Generate multiple responses from an LLM.

        Returns:
            List[LLMOutput]: List of outputs from the model.

        Raises:
            NotImplementedError: If not implemented in a subclass.
        """
        raise NotImplementedError("__call__ should be implemented for your LLM")
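
A hypothetical subclass, shown only to illustrate the expected interface: it wraps a single-output LLMWrapper and calls it repeatedly to produce a batch (RepeatedSamplingBatchLLM is not part of the library; LLMWrapper, LLMOutput and BatchLLMWrapper are assumed to be imported from luq.models).

import typing as T

# Hypothetical example: draw n independent samples from a single-output wrapper.
class RepeatedSamplingBatchLLM(BatchLLMWrapper):
    def __init__(self, llm: LLMWrapper, n_samples: int = 5):
        self.llm = llm
        self.n_samples = n_samples

    def __call__(self, prompt: str, **kwargs) -> T.List[LLMOutput]:
        return [self.llm(prompt, **kwargs) for _ in range(self.n_samples)]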

__call__(*args, **kwargs)

Generate multiple responses from an LLM.

Returns:

Type Description
List[LLMOutput]

List[LLMOutput]: List of outputs from the model.

Raises:

Type Description
NotImplementedError

If not implemented in a subclass.

Source code in luq/models/llm.py
def __call__(self, *args, **kwargs) -> T.List[LLMOutput]:
    """
    Generate multiple responses from an LLM.

    Returns:
        List[LLMOutput]: List of outputs from the model.

    Raises:
        NotImplementedError: If not implemented in a subclass.
    """
    raise NotImplementedError("__call__ should be implemented for your LLM")

ClaudeWrapper

Wrapper for Anthropic's Claude models.

Source code in luq/models/llm.py
class ClaudeWrapper:
    """
    Wrapper for Anthropic's Claude models.
    """

    def __init__(self, api_key: str):
        """
        Initializes the ClaudeWrapper.

        Args:
            api_key (str): Anthropic API key.
        """
        self.client = Anthropic(api_key=api_key)

    def __call__(self, prompt: str, model: str = "claude-3-opus-20240229", temperature: float = 1.0, max_tokens: int = 1024) -> LLMOutput:
        """
        Generates a response from Claude with optional parameters.

        Note:
            Claude's API currently does not support returning log probabilities.

        Args:
            prompt (str): Input prompt for Claude.
            model (str): Claude model to use.
            temperature (float): Sampling temperature.
            max_tokens (int): Maximum number of tokens to generate.

        Returns:
            LLMOutput: Generated answer text.
        """
        # Anthropic API does not currently support logprobs in the chat API.
        try:
            response = self.client.messages.create(
                model=model,
                max_tokens=max_tokens,
                messages=[
                    {"role": "user", "content": prompt}
                ],
                temperature=temperature,
            )

            # Extract text response
            text = response.content[0].text if response.content else ""
            return LLMOutput(answer=text)

        except Exception as e:
            raise RuntimeError(f"Claude API call failed: {e}")
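
A minimal usage sketch (the API key is a placeholder):

# Placeholder API key; substitute your own Anthropic key.
claude = ClaudeWrapper(api_key="<anthropic-api-key>")

output = claude(
    "Summarize special relativity in one sentence.",
    model="claude-3-opus-20240229",
    temperature=0.7,
    max_tokens=256,
)
print(output.answer)
print(output.logprobs)  # None: the Claude API does not return log probabilities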

__call__(prompt, model='claude-3-opus-20240229', temperature=1.0, max_tokens=1024)

Generates a response from Claude with optional parameters.

Note

Claude's API currently does not support returning log probabilities.

Parameters:

Name Type Description Default
prompt str

Input prompt for Claude.

required
model str

Claude model to use.

'claude-3-opus-20240229'
temperature float

Sampling temperature.

1.0
max_tokens int

Maximum number of tokens to generate.

1024

Returns:

Name Type Description
LLMOutput LLMOutput

Generated answer text.

Source code in luq/models/llm.py
def __call__(self, prompt: str, model: str = "claude-3-opus-20240229", temperature: float = 1.0, max_tokens: int = 1024) -> LLMOutput:
    """
    Generates a response from Claude with optional parameters.

    Note:
        Claude's API currently does not support returning log probabilities.

    Args:
        prompt (str): Input prompt for Claude.
        model (str): Claude model to use.
        temperature (float): Sampling temperature.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        LLMOutput: Generated answer text.
    """
    # Anthropic API does not currently support logprobs in the chat API.
    try:
        response = self.client.messages.create(
            model=model,
            max_tokens=max_tokens,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=temperature,
        )

        # Extract text response
        text = response.content[0].text if response.content else ""
        return LLMOutput(answer=text)

    except Exception as e:
        raise RuntimeError(f"Claude API call failed: {e}")

__init__(api_key)

Initializes the ClaudeWrapper.

Parameters:

Name Type Description Default
api_key str

Anthropic API key.

required
Source code in luq/models/llm.py
def __init__(self, api_key: str):
    """
    Initializes the ClaudeWrapper.

    Args:
        api_key (str): Anthropic API key.
    """
    self.client = Anthropic(api_key=api_key)

HFLLMWrapper

Bases: LLMWrapper

Hugging Face LLM wrapper using a tokenizer and model from the transformers library.

Source code in luq/models/llm.py
class HFLLMWrapper(LLMWrapper):
    """
    Hugging Face LLM wrapper using a tokenizer and model from the transformers library.
    """
    def __init__(
        self, tokenizer: transformers.AutoTokenizer, model: transformers.PreTrainedModel
    ):
        """
        Initializes the HFLLMWrapper with a tokenizer and model.

        Args:
            tokenizer (transformers.AutoTokenizer): A Hugging Face tokenizer instance.
            model (transformers.PreTrainedModel): A Hugging Face model instance.

        Raises:
            ValueError: If the tokenizer or model is not a valid Hugging Face object.
        """
        if isinstance(tokenizer, transformers.PreTrainedTokenizerBase):
            self.tokenizer = tokenizer
        else:
            raise ValueError("Requires a text generation pipeline from transformers")
        if isinstance(model, transformers.PreTrainedModel):
            self.model = model
        else:
            raise ValueError("Requires a text generation pipeline from transformers")

    def __call__(
        self,
        prompt: str,
        temperature: float = 1.0,
        max_new_tokens=1024,
        *args,
        **kwargs
    ) -> LLMOutput:
        """
        Generates a response from the Hugging Face model.

        Args:
            prompt (str): The prompt to send to the model.
            temperature (float): Sampling temperature.
            max_new_tokens (int): Maximum number of tokens to generate.
            *args: Additional positional arguments.
            **kwargs: Additional keyword arguments.

        Returns:
            LLMOutput: The generated text and associated log probabilities.
        """
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                return_dict_in_generate=True,
                output_scores=True,
                do_sample=True,
                temperature=temperature,
            )
        generated_ids = outputs.sequences[0]
        generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
        token_scores = outputs.scores
        generated_tokens = generated_ids[len(inputs["input_ids"][0]) :]

        logprobs = []
        for i, token_id in enumerate(generated_tokens):
            logits = token_scores[i][0]
            log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
            token_logprob = log_probs[token_id].item()
            logprobs.append((self.tokenizer.decode([token_id]), token_logprob))

        # logprobs is a list of pairs (token, logprob)
        logprobs = [el[1] for el in logprobs]
        return LLMOutput(
            answer=generated_text,
            logprobs=torch.tensor(logprobs, device=self.model.device),
        )
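
A minimal usage sketch with a small Hugging Face checkpoint ("gpt2" here is only an example; any causal language model loadable through transformers should work the same way):

from transformers import AutoModelForCausalLM, AutoTokenizer

# "gpt2" is just a small example checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

llm = HFLLMWrapper(tokenizer=tokenizer, model=model)
output = llm("The capital of France is", temperature=0.8, max_new_tokens=20)
print(output.answer)
print(output.logprobs.shape)  # one log-probability per generated token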

__call__(prompt, temperature=1.0, max_new_tokens=1024, *args, **kwargs)

Generates a response from the Hugging Face model.

Parameters:

Name Type Description Default
prompt str

The prompt to send to the model.

required
temperature float

Sampling temperature.

1.0
max_new_tokens int

Maximum number of tokens to generate.

1024
*args

Additional positional arguments.

()
**kwargs

Additional keyword arguments.

{}

Returns:

Name Type Description
LLMOutput LLMOutput

The generated text and associated log probabilities.

Source code in luq/models/llm.py
def __call__(
    self,
    prompt: str,
    temperature: float = 1.0,
    max_new_tokens=1024,
    *args,
    **kwargs
) -> LLMOutput:
    """
    Generates a response from the Hugging Face model.

    Args:
        prompt (str): The prompt to send to the model.
        temperature (float): Sampling temperature.
        max_new_tokens (int): Maximum number of tokens to generate.
        *args: Additional positional arguments.
        **kwargs: Additional keyword arguments.

    Returns:
        LLMOutput: The generated text and associated log probabilities.
    """
    inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

    with torch.no_grad():
        outputs = self.model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            return_dict_in_generate=True,
            output_scores=True,
            do_sample=True,
            temperature=temperature,
        )
    generated_ids = outputs.sequences[0]
    generated_text = self.tokenizer.decode(generated_ids, skip_special_tokens=True)
    token_scores = outputs.scores
    generated_tokens = generated_ids[len(inputs["input_ids"][0]) :]

    logprobs = []
    for i, token_id in enumerate(generated_tokens):
        logits = token_scores[i][0]
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        token_logprob = log_probs[token_id].item()
        logprobs.append((self.tokenizer.decode([token_id]), token_logprob))

    # logprobs is a list of pairs (token, logprob)
    logprobs = [el[1] for el in logprobs]
    return LLMOutput(
        answer=generated_text,
        logprobs=torch.tensor(logprobs, device=self.model.device),
    )

__init__(tokenizer, model)

Initializes the HFLLMWrapper with a tokenizer and model.

Parameters:

Name Type Description Default
tokenizer AutoTokenizer

A Hugging Face tokenizer instance.

required
model PreTrainedModel

A Hugging Face model instance.

required

Raises:

Type Description
ValueError

If the tokenizer or model is not a valid Hugging Face object.

Source code in luq/models/llm.py
def __init__(
    self, tokenizer: transformers.AutoTokenizer, model: transformers.PreTrainedModel
):
    """
    Initializes the HFLLMWrapper with a tokenizer and model.

    Args:
        tokenizer (transformers.AutoTokenizer): A Hugging Face tokenizer instance.
        model (transformers.PreTrainedModel): A Hugging Face model instance.

    Raises:
        ValueError: If the tokenizer or model is not a valid Hugging Face object.
    """
    if isinstance(tokenizer, transformers.PreTrainedTokenizerBase):
        self.tokenizer = tokenizer
    else:
        raise ValueError("Requires a text generation pipeline from transformers")
    if isinstance(model, transformers.PreTrainedModel):
        self.model = model
    else:
        raise ValueError("Requires a text generation pipeline from transformers")

LLMOutput dataclass

Represents the output of a language model.

Attributes:

Name Type Description
answer str

The generated text answer from the language model.

logprobs Tensor | None

Optional tensor containing the log probabilities associated with the generated tokens.

Source code in luq/models/llm.py
@dataclass
class LLMOutput:
    """
    Represents the output of a language model.

    Attributes:
        answer (str): The generated text answer from the language model.
        logprobs (torch.Tensor | None): Optional tensor containing the log probabilities
            associated with the generated tokens.
    """
    answer: str
    logprobs: torch.Tensor | None = None  # tensor of per-token log probabilities
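
Constructing an LLMOutput by hand, e.g. for tests or offline evaluation (the values are illustrative):

import torch

output = LLMOutput(
    answer="Paris",
    logprobs=torch.tensor([-0.1, -0.05]),  # optional; defaults to None
)
print(output.answer, output.logprobs)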

LLMSamples dataclass

Contains multiple samples generated by a language model along with metadata.

Attributes:

Name Type Description
samples List[LLMOutput]

A list of multiple LLMOutput samples.

answer LLMOutput

The selected or final answer output.

params Dict[str, Any]

Parameters used to generate the samples.

Source code in luq/models/llm.py
@dataclass
class LLMSamples:
    """
    Contains multiple samples generated by a language model along with metadata.

    Attributes:
        samples (List[LLMOutput]): A list of multiple LLMOutput samples.
        answer (LLMOutput): The selected or final answer output.
        params (Dict[str, Any]): Parameters used to generate the samples.
    """
    samples: T.List[LLMOutput]
    answer: LLMOutput
    params: T.Dict[str, T.Any]

    def __len__(self) -> int:
        """
        Returns the number of samples generated.

        Returns:
            int: The count of samples.
        """
        return len(self.samples)
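
Assembling an LLMSamples container by hand (values are illustrative; in practice it is produced by generate_n_samples_and_answer):

samples = LLMSamples(
    samples=[LLMOutput(answer="Paris"), LLMOutput(answer="paris")],
    answer=LLMOutput(answer="Paris"),
    params={"n_samples": 2, "temp_gen": 1.0},
)
print(len(samples))  # 2, via __len__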

__len__()

Returns the number of samples generated.

Returns:

Name Type Description
int int

The count of samples.

Source code in luq/models/llm.py
def __len__(self) -> int:
    """
    Returns the number of samples generated.

    Returns:
        int: The count of samples.
    """
    return len(self.samples)

LLMWrapper

Source code in luq/models/llm.py
class LLMWrapper:
    def __call__(self, *args, **kwargs) -> LLMOutput:
        """
        Abstract base wrapper for language model interfaces.

        This class is meant to be subclassed to implement specific LLM calls.
        """
        raise NotImplementedError("__call__ should be implemented for your LLM")
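
A hypothetical subclass, for illustration only: it echoes the prompt back and attaches no log probabilities; real subclasses would call an actual model backend (EchoLLM is not part of the library).

class EchoLLM(LLMWrapper):
    def __call__(self, prompt: str, **kwargs) -> LLMOutput:
        # Ignores sampling parameters such as temperature, top_p and top_k.
        return LLMOutput(answer=prompt)

llm = EchoLLM()
print(llm("hello").answer)  # "hello"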

__call__(*args, **kwargs)

Abstract base wrapper for language model interfaces.

This class is meant to be subclassed to implement specific LLM calls.

Source code in luq/models/llm.py
def __call__(self, *args, **kwargs) -> LLMOutput:
    """
    Abstract base wrapper for language model interfaces.

    This class is meant to be subclassed to implement specific LLM calls.
    """
    raise NotImplementedError("__call__ should be implemented for your LLM")

all_logits_present(samples)

Checks whether all LLM outputs contain log probabilities.

Parameters:

Name Type Description Default
samples List[LLMOutput]

A list of LLM output samples.

required

Returns:

Name Type Description
bool bool

True if all samples include logprobs, False otherwise.

Source code in luq/models/llm.py
def all_logits_present(samples: T.List[LLMOutput]) -> bool:
    """
    Checks whether all LLM outputs contain log probabilities.

    Args:
        samples (List[LLMOutput]): A list of LLM output samples.

    Returns:
        bool: True if all samples include logprobs, False otherwise.
    """
    return all(sample.logprobs is not None for sample in samples)
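
A quick check before applying a logprob-based UQ method (values are illustrative):

import torch

samples = [
    LLMOutput(answer="yes", logprobs=torch.tensor([-0.2, -0.4])),
    LLMOutput(answer="no"),  # no logprobs attached
]
print(all_logits_present(samples))  # False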

generate_n_samples_and_answer(llm, prompt, temp_gen=1.0, temp_answer=0.1, top_p_gen=0.9, top_k_gen=16, top_p_ans=0.7, top_k_ans=4, n_samples=10)

Generates multiple LLM samples and a single final answer using specified parameters.

Parameters:

Name Type Description Default
llm LLMWrapper

The language model wrapper to use.

required
prompt str

The prompt to pass to the LLM.

required
temp_gen float

Temperature for generating samples.

1.0
temp_answer float

Temperature for the final answer.

0.1
top_p_gen float

Nucleus sampling parameter for generation.

0.9
top_k_gen int

Top-k sampling parameter for generation.

16
top_p_ans float

Nucleus sampling parameter for answer.

0.7
top_k_ans int

Top-k sampling parameter for answer.

4
n_samples int

Number of samples to generate.

10

Returns:

Name Type Description
LLMSamples LLMSamples

A collection of generated samples, the final answer, and parameters used.

Raises:

Type Description
NotImplementedError

If llm is not an instance of LLMWrapper.

Source code in luq/models/llm.py
def generate_n_samples_and_answer(
    llm: LLMWrapper,
    prompt: str,
    temp_gen: float = 1.0,
    temp_answer: float = 0.1,
    top_p_gen: float = 0.9,
    top_k_gen: int = 16,
    top_p_ans: float = 0.7,
    top_k_ans: int = 4,
    n_samples: int = 10,
) -> LLMSamples:
    """
    Generates multiple LLM samples and a single final answer using specified parameters.

    Args:
        llm (LLMWrapper): The language model wrapper to use.
        prompt (str): The prompt to pass to the LLM.
        temp_gen (float): Temperature for generating samples.
        temp_answer (float): Temperature for the final answer.
        top_p_gen (float): Nucleus sampling parameter for generation.
        top_k_gen (int): Top-k sampling parameter for generation.
        top_p_ans (float): Nucleus sampling parameter for answer.
        top_k_ans (int): Top-k sampling parameter for answer.
        n_samples (int): Number of samples to generate.

    Returns:
        LLMSamples: A collection of generated samples, the final answer, and parameters used.

    Raises:
        NotImplementedError: If `llm` is not an instance of LLMWrapper.
    """
    if isinstance(llm, LLMWrapper):
        sampled_answers = [
            llm(prompt, temperature=temp_gen, top_p=top_p_gen, top_k=top_k_gen)
            for _ in range(n_samples)
        ]
        answer = llm(prompt, temperature=temp_answer, top_p=top_p_ans, top_k=top_k_ans)
        params = {
            "prompt": prompt,
            "temp_gen": temp_gen,
            "temp_answer": temp_answer,
            "top_p_gen": top_p_gen,
            "top_k_gen": top_k_gen,
            "top_p_ans": top_p_ans,
            "top_k_ans": top_k_ans,
            "n_samples": n_samples,
            "llm": str(llm),
        }
        return LLMSamples(samples=sampled_answers, answer=answer, params=params)
    else:
        raise NotImplementedError(
            "generation is currently supported only for LLMWrapper"
        )
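
An end-to-end sketch using the hypothetical EchoLLM from the LLMWrapper example above; any LLMWrapper subclass would be used the same way:

samples = generate_n_samples_and_answer(
    llm=EchoLLM(),
    prompt="What is the capital of France?",
    temp_gen=1.0,
    temp_answer=0.1,
    n_samples=5,
)
print(len(samples))                  # 5 sampled generations
print(samples.answer.answer)         # the separately generated final answer
print(samples.params["n_samples"])   # generation parameters are recorded alongside the samples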

update_base_url(request, openai_endpoint_url)

Modifies the base URL of an OpenAI request to route to a custom Azure endpoint.

Parameters:

Name Type Description Default
request Request

The HTTPX request object to be modified.

required
openai_endpoint_url str

The Azure OpenAI endpoint path to use.

required
Source code in luq/models/llm.py
def update_base_url(request: httpx.Request, openai_endpoint_url: str) -> None:
    """
    Modifies the base URL of an OpenAI request to route to a custom Azure endpoint.

    Args:
        request (httpx.Request): The HTTPX request object to be modified.
        openai_endpoint_url (str): The Azure OpenAI endpoint path to use.
    """
    if request.url.path == "/chat/completions":
        request.url = request.url.copy_with(path=openai_endpoint_url)

Datasets

GenerationDataset

Bases: DatasetDict

Source code in luq/datasets/dataset.py
class GenerationDataset(DatasetDict):
    def __init__(self, data_path: str = None, arrow_table=None):
        """
        Initializes the dataset object.
        :param data_path: Path to the JSON file containing the dataset.
        """
        if data_path is not None:
            dataset_dict = self.load_from_json(data_path)
            super().__init__(dataset_dict)
        elif arrow_table is not None:
            dataset = Dataset(arrow_table)
            super().__init__({"train": dataset})
        else:
            raise ValueError("Either data_path or arrow_table must be provided")

    @staticmethod
    def load_from_json(data_path: str) -> Dataset:
        """
        Loads the dataset from a JSON file and converts it into a Hugging Face Dataset object.
        """
        with open(data_path, "r", encoding="utf-8") as f:
            raw_data = json.load(f)

        split_datasets = {}
        for split, items in raw_data["data"].items():
            processed_items = [
                {
                    "question": item["question"],
                    "samples": item["samples"],
                    "logprobs": item.get("logprobs", []),
                    "answer": item["answer"],
                    "gt_answer": item["gt_answer"],
                    "accuracy": item.get("accuracy"),
                }
                for item in items
            ]
            split_datasets[split] = Dataset.from_list(processed_items)
        return DatasetDict(split_datasets)

    def split_dataset(self, train_size: float = 0.8) -> Dict[str, "GenerationDataset"]:
        """
        Splits the dataset into train and test sets.
        :param train_size: Proportion of the dataset to include in the train split.
        :return: Dictionary containing train and test GenerationDataset objects
        """
        splits = super().train_test_split(train_size=train_size)
        return {
            "train": GenerationDataset(arrow_table=splits["train"]._data),
            "test": GenerationDataset(arrow_table=splits["test"]._data),
        }

    @classmethod
    def from_dataset(cls, dataset: Dataset) -> "GenerationDataset":
        """
        Creates a GenerationDataset from a regular Dataset object
        """
        return cls(arrow_table=dataset._data)
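
A usage sketch; "generations.json" is a placeholder path, and the file is expected to follow the layout consumed by load_from_json, i.e. {"data": {"train": [{"question": ..., "samples": [...], "answer": ..., "gt_answer": ..., ...}, ...]}}.

# Placeholder path; the JSON layout follows load_from_json above.
dataset = GenerationDataset(data_path="generations.json")
print(dataset["train"][0]["question"])

# Split into train/test subsets as documented for split_dataset.
splits = dataset.split_dataset(train_size=0.8)
train_ds, test_ds = splits["train"], splits["test"]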

__init__(data_path=None, arrow_table=None)

Initializes the dataset object. data_path is the path to the JSON file containing the dataset.

Source code in luq/datasets/dataset.py
def __init__(self, data_path: str = None, arrow_table=None):
    """
    Initializes the dataset object.
    :param data_path: Path to the JSON file containing the dataset.
    """
    if data_path is not None:
        dataset_dict = self.load_from_json(data_path)
        super().__init__(dataset_dict)
    elif arrow_table is not None:
        dataset = Dataset(arrow_table)
        super().__init__({"train": dataset})
    else:
        raise ValueError("Either data_path or arrow_table must be provided")

from_dataset(dataset) classmethod

Creates a GenerationDataset from a regular Dataset object

Source code in luq/datasets/dataset.py
@classmethod
def from_dataset(cls, dataset: Dataset) -> "GenerationDataset":
    """
    Creates a GenerationDataset from a regular Dataset object
    """
    return cls(arrow_table=dataset._data)

load_from_json(data_path) staticmethod

Loads the dataset from a JSON file and converts it into a Hugging Face Dataset object.

Source code in luq/datasets/dataset.py
@staticmethod
def load_from_json(data_path: str) -> Dataset:
    """
    Loads the dataset from a JSON file and converts it into a Hugging Face Dataset object.
    """
    with open(data_path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    split_datasets = {}
    for split, items in raw_data["data"].items():
        processed_items = [
            {
                "question": item["question"],
                "samples": item["samples"],
                "logprobs": item.get("logprobs", []),
                "answer": item["answer"],
                "gt_answer": item["gt_answer"],
                "accuracy": item.get("accuracy"),
            }
            for item in items
        ]
        split_datasets[split] = Dataset.from_list(processed_items)
    return DatasetDict(split_datasets)

split_dataset(train_size=0.8)

Splits the dataset into train and test sets. train_size is the proportion of the dataset to include in the train split; returns a dictionary containing train and test GenerationDataset objects.

Source code in luq/datasets/dataset.py
def split_dataset(self, train_size: float = 0.8) -> Dict[str, "GenerationDataset"]:
    """
    Splits the dataset into train and test sets.
    :param train_size: Proportion of the dataset to include in the train split.
    :return: Dictionary containing train and test GenerationDataset objects
    """
    splits = super().train_test_split(train_size=train_size)
    return {
        "train": GenerationDataset(arrow_table=splits["train"]._data),
        "test": GenerationDataset(arrow_table=splits["test"]._data),
    }

Utility Functions

SeqProbMode

Bases: Enum

Enumeration for modes of combining token probabilities in a sequence.

Attributes:

Name Type Description
PROD

Use the product of probabilities.

AVG

Use the average of probabilities.

Source code in luq/utils/utils.py
class SeqProbMode(Enum):
    """
    Enumeration for modes of combining token probabilities in a sequence.

    Attributes:
        PROD: Use the product of probabilities.
        AVG: Use the average of probabilities.
    """
    PROD = "prod"
    AVG = "avg"

entropy(probabilities)

Computes the entropy of a probability distribution.

Parameters:

Name Type Description Default
probabilities Union[List[float], Tensor]

A list or tensor of probabilities. The probabilities should sum to 1 and represent a valid distribution.

required

Returns:

Type Description
Tensor

torch.Tensor: The computed entropy as a scalar tensor value.

Notes

Adds a small epsilon (1e-9) to probabilities to avoid log(0).

Source code in luq/utils/utils.py
def entropy(probabilities: Union[List[float], torch.Tensor]) -> torch.Tensor:
    """
    Computes the entropy of a probability distribution.

    Args:
        probabilities (Union[List[float], torch.Tensor]): A list or tensor of probabilities.
            The probabilities should sum to 1 and represent a valid distribution.

    Returns:
        torch.Tensor: The computed entropy as a scalar tensor value.

    Notes:
        Adds a small epsilon (1e-9) to probabilities to avoid log(0).
    """
    probabilities = (
        torch.tensor(probabilities, dtype=torch.float32)
        if isinstance(probabilities, list)
        else probabilities
    )
    entropy_value = -torch.sum(probabilities * torch.log(probabilities + 1e-9))
    return entropy_value
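
A worked example: a uniform distribution over four outcomes has entropy log 4 ≈ 1.386 nats, while a one-hot distribution has entropy close to zero.

import torch

print(entropy([0.25, 0.25, 0.25, 0.25]))             # ≈ 1.386
print(entropy(torch.tensor([1.0, 0.0, 0.0, 0.0])))   # ≈ 0.0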

Scripts