arthur_bench.scoring#

class arthur_bench.scoring.ScoringMethodName(value, names=None, *, module=None, qualname=None, type=None, start=1, boundary=None)#

Bases: str, Enum

BERTScore = 'bertscore'#
ExactMatch = 'exact_match'#
Hallucination = 'hallucination'#
HedgingLanguage = 'hedging_language'#
PythonUnitTesting = 'python_unit_testing'#
QACorrectness = 'qa_correctness'#
Readability = 'readability'#
Specificity = 'specificity'#
SummaryQuality = 'summary_quality'#
WordCountMatch = 'word_count_match'#
arthur_bench.scoring.scorer_from_string(method: str) type[arthur_bench.scoring.scorer.Scorer]#

Submodules#

class arthur_bench.scoring.bertscore.BERTScore(model_type='microsoft/deberta-v3-base', precision_weight=0.1)#

Bases: Scorer

Tailored bert score implementation.

https://arxiv.org/abs/1904.09675

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

class arthur_bench.scoring.exact_match.ExactMatch(case_sensitive=True)#

Bases: Scorer

Returns 1 if candidate matches reference, 0 if candidate does not match reference.

static categories() List[Category]#

All possible values returned by the scorer if output type is categorical.

static is_categorical() bool#

Whether the scorer is continuous or categorical. categories() should be implemented if True

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

class arthur_bench.scoring.hallucination.Hallucination#

Bases: Scorer

Score each output against a context using Arthur’s hosted hallucination checker. A score of 1.0 means the hallucination checker estimates the output is supported by the context. A score of 0.0 means the hallucination checker found information in the output unsupported by the context.

static categories() List[Category]#

All possible values returned by the scorer if output type is categorical.

static is_categorical() bool#

Whether the scorer is continuous or categorical. categories() should be implemented if True

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

class arthur_bench.scoring.hedging_language.HedgingLanguage(model_type: str = 'microsoft/deberta-v3-base', hedging_language: str = "As an AI language model, I don't have personal opinions, emotions, or beliefs.")#

Bases: Scorer

Given an input question and model output, determine if the output contains hedging language such as “As an AI language model, I don’t have personal opinions, emotions, or beliefs”. The values returned are a similarity score (BERTScore), with higher values corresponding to higher likelihood of hedging language being present in the model output.

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

class arthur_bench.scoring.python_unit_testing.PythonUnitTesting(unit_test_dir: str | None = None, unit_tests: List[str] | None = None)#

Bases: Scorer

Wrapping the HuggingFace code_eval metric

Scores each candidate_output as a function against a pre-prepared unit test

Note: considers any code with non-standard python libraries (e.g. numpy) to have an error

https://huggingface.co/spaces/evaluate-metric/code_eval

static categories() List[Category]#

All possible values returned by the scorer if output type is categorical.

static is_categorical() bool#

Whether the scorer is continuous or categorical. categories() should be implemented if True

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 1) List[ScoreResult]#

Score a set of test cases. This method doesn’t need to be implemented in most cases, but can be overridden to add additional functionality such as task-specific logging.

Parameters:
  • candidate_outputs – candidate generations to score

  • reference_outputs – reference strings representing target outputs

  • inputs – input strings being tested

  • contexts – optional corresponding contexts, if needed by scorer

  • batch_size – size of batches

Returns:

scoring results for this run. Float scores are deprecated, use ScoreResult instead

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

class arthur_bench.scoring.qa_quality.QAQualityCorrectness(llm: BaseChatModel | None = None)#

Bases: Scorer

Given an input question, context string, and model generation, determine if the generation produced a correct answer.

async arun_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#

Reference batch is not used for this scoring method, QA correctness requires an input_text_batch and context_batch

static categories() List[Category]#

All possible values returned by the scorer if output type is categorical.

static is_categorical() bool#

Whether the scorer is continuous or categorical. categories() should be implemented if True

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Reference batch is not used for this scoring method, QA correctness requires an input_text_batch and context_batch

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

static validate_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) Tuple[List[str], List[str]]#
class arthur_bench.scoring.readability.Readability#

Bases: Scorer

Flesch Reading Ease Score: the higher the score, the easier to read. Scores of 100-90 correlate to a 5th grade reading level, while scores <10 are classified as “Extremely difficult to read, and best understood by university graduates.”

https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

class arthur_bench.scoring.scorer.Scorer#

Bases: ABC

Base class for all scorers. Compute a float score for a given model generation.

async arun(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 5) List[float] | List[ScoreResult]#

Async version of run method.

async arun_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#

Async version of run_batch method.

static categories() List[Category] | None#

All possible values returned by the scorer if output type is categorical.

classmethod from_dict(config: dict)#

Load a scorer from a json configuration file.

static is_categorical() bool#

Whether the scorer is continuous or categorical. categories() should be implemented if True

abstract static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 1) List[float] | List[ScoreResult]#

Score a set of test cases. This method doesn’t need to be implemented in most cases, but can be overridden to add additional functionality such as task-specific logging.

Parameters:
  • candidate_outputs – candidate generations to score

  • reference_outputs – reference strings representing target outputs

  • inputs – input strings being tested

  • contexts – optional corresponding contexts, if needed by scorer

  • batch_size – size of batches

Returns:

scoring results for this run. Float scores are deprecated, use ScoreResult instead

abstract run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

to_metadata() ScoringMethod#
classmethod type() ScoringMethodType#

Supplies whether a scorer is built-in or custom.

This method is implemented by checking whether the Scorer class is part of the arthur_bench.scoring module.

Returns:

the type (built-in or custom)

class arthur_bench.scoring.specificity.Specificity#

Bases: Scorer

Returns a score from 0.0 to 1.0 indicating how specific the candidate output language is. Higher scores indicate that the language is more specific; lower scores indicate more vague language.

Specificity is computed through detecting words that indicate vagueness (predefined), determining how rare the words used are according to word frequencies calculated by popular NLP corpora, and detecting use of proper nouns and numbers.

get_mean_word_freq(candidate_output: str) float#

Returns mean word frequency of candidate output. Higher values indicate that more common words on average are used in the candidate output. Considers only words with frequency <0.001, truncating probability of words with higher frequencies to 0.001.

get_num_vague_words(candidate_output: str) int#

Returns number of words in candidate_output which are in a list of pre-defined vague words.

get_pn_and_num(candidate_output: str) int#

Returns total number of Proper Nouns and Numbers in candidate output. Determined heuristically via NNP and CD nltk tags.

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

static requires_reference() bool#

True if scorer requires reference output to compute score, False otherwise

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead

class arthur_bench.scoring.summary_quality.SummaryQuality(llm: BaseChatModel | None = None, context_window: int = 4096, tokenizer: Encoding | None = None)#

Bases: Scorer

Comprehensive measure of summarization quality compared to a reference summary.

async arun(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 5) List[float] | List[ScoreResult]#

Async version of run method.

async arun_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[float] | List[ScoreResult]#

Summary quality requires input_text_batch. Asynchronous implementation

static categories() List[Category]#

All possible values returned by the scorer if output type is categorical.

static is_categorical() bool#

Whether the scorer is continuous or categorical. categories() should be implemented if True

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

run(candidate_outputs: List[str], reference_outputs: List[str] | None = None, inputs: List[str] | None = None, contexts: List[str] | None = None, batch_size: int = 1) List[ScoreResult] | List[float]#

Score a set of test cases. This method doesn’t need to be implemented in most cases, but can be overridden to add additional functionality such as task-specific logging.

Parameters:
  • candidate_outputs – candidate generations to score

  • reference_outputs – reference strings representing target outputs

  • inputs – input strings being tested

  • contexts – optional corresponding contexts, if needed by scorer

  • batch_size – size of batches

Returns:

scoring results for this run. Float scores are deprecated, use ScoreResult instead

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Summary quality requires input_text_batch.

to_dict(warn=False)#

Provides a json serializable representation of the scorer.

static validate_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) Tuple[List[str], List[str]]#
arthur_bench.scoring.summary_quality.truncate_input_text(input_text, ref_output, cand_output, context_window: int = 4096, tokenizer: ~tiktoken.core.Encoding = <Encoding 'cl100k_base'>) Tuple[str, bool]#

Truncates the input_text to fit in LLM evaluator context

Truncate the input text so that the filled-in COMPARE prompt which contains {input text + summary A + summary B} fits in the evaluator context window

Returns the tuple (text, whether text was truncated)

class arthur_bench.scoring.utils.suppress_warnings(logger_name: str)#

Bases: object

A context-manager class to temporarily set the logging level for a logger to ERROR before returning it to its previous state.

class arthur_bench.scoring.word_count_match.WordCountMatch#

Bases: Scorer

Calculates how similar the number of words in the candidate output is to the number of words in the reference output. Scores span from 0 to 1. A score of 1.0 indicates that there are the same number of words in the candidate output as in the reference output. Scores less than 1.0 are calculated as ((len_reference-delta)/len_reference) where delta is the absolute difference in word lengths between the candidate and reference outputs. All negative computed values are truncated to 0. Utilizes lexicon count, removing punctuations: https://pypi.org/project/textstat/

static name() str#

Get the name of this Scorer.

Returns:

the Scorer name

run_batch(candidate_batch: List[str], reference_batch: List[str] | None = None, input_text_batch: List[str] | None = None, context_batch: List[str] | None = None) List[ScoreResult]#

Score a batch of candidate generations.

Parameters:
  • candidate_batch – candidate generations to score

  • reference_batch – reference strings representing target outputs

  • input_text_batch – optional corresponding inputs

  • context_batch – optional corresponding contexts, if needed by scorer

Returns:

scoring results for this batch. Float scores are deprecated, use ScoreResult instead