arthur_bench.run#

Submodules#

class arthur_bench.run.testrun.TestRun(*, name: str, test_case_outputs: List[TestCaseOutput], description: str | None = None, model_name: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None, model_version: str | None = None, test_suite_id: UUID, client: BenchClient, id: UUID | None = None)#

Bases: CreateRunRequest

class Config#

Bases: object

arbitrary_types_allowed = True#

property categories: List[str | None]#

client: BenchClient#

classmethod from_flattened(run_name: str, ids: List[UUID], candidate_output_list: List[str], scores: List[float] | List[ScoreResult], client: BenchClient, test_suite_id: UUID, model_name: str | None = None, model_version: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None)#

id: UUID | None#

property output: List[str]#

save() → UUID#: Save a test run.

property scores: List[float | None]#

test_suite_id: UUID#

class arthur_bench.run.testsuite.TestSuite(name: str, scoring_method: str | Scorer, description: str | None = None, reference_data: DataFrame | None = None, reference_data_path: str | None = None, input_column: str = 'input', reference_column: str = 'reference_output', input_text_list: List[str] | None = None, reference_output_list: List[str] | None = None, client: BenchClient | None = None)#

Bases: object

Reusable pipeline for running a test suite built from reference_data and evaluated using scoring_method

Parameters:

name – name of the test suite
scoring_method – scoring method or scorer instance to use to evaluate the results of a test run, as a string/enum or class instance
description – short description of the task tested by this suite
reference_data – dataframe of prompts and reference outputs
reference_data_path – filepath to csv of prompts and reference outputs, required if not specifying reference_data
input_column – the column of reference_data containing prompts, defaults to ‘input’
reference_column – the column of reference_data containing reference outputs, defaults to ‘reference_output’
input_text_list – list of strings of input texts that can be provided instead of dataframe columns
reference_output_list – list of strings of reference outputs that can be provided instead of dataframe columns

async arun(run_name: str, candidate_data: DataFrame | None = None, candidate_data_path: str | None = None, candidate_column: str = 'candidate_output', candidate_output_list: List[str] | None = None, context_column: str | None = None, context_list: List[str] | None = None, save: bool = True, batch_size: int = 5, model_name: str | None = None, model_version: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None) → TestRun#

property description: str | None#

property input_texts: List[str]#

property name: str#

property reference_outputs: List[str | None]#

run(run_name: str, candidate_data: DataFrame | None = None, candidate_data_path: str | None = None, candidate_column: str = 'candidate_output', candidate_output_list: List[str] | None = None, context_column: str | None = None, context_list: List[str] | None = None, save: bool = True, batch_size: int = 1, model_name: str | None = None, model_version: str | None = None, foundation_model: str | None = None, prompt_template: str | None = None) → TestRun#

Score a test run on candidate outputs.

Parameters:

run_name – name for the test run
candidate_data – dataframe of candidate responses to test prompts
candidate_data_path – filepath to csv containing candidate responses to test prompts
candidate_column – the column of candidate data containing candidate responses, defaults to ‘candidate_output’
candidate_output_list – list of strings of candidate outputs that can be provided instead of dataframe
context_column – the column of reference_data containing supporting context for question-answering tasks
context_list – list of strings containing supporting context for question-answering tasks
save – whether to save the run results to file
batch_size – the batch_size to use when computing scores
model_name – model name for model used to generate outputs
model_version – model version of model used to generate outputs
foundation_model – foundation model name used to generate outputs
prompt_template – prompt template name used to generate outputs

Returns:

TestRun object containing scored outputs

save()#: Save a test suite to local file system.

property scoring_method: str#

property test_cases: List[TestCaseResponse]#