
Python API

bioclip.TreeOfLifeClassifier(**kwargs)

Bases: BaseClassifier

A classifier for predicting taxonomic ranks for images.

See BaseClassifier for details on **kwargs.

Source code in src/bioclip/predict.py
def __init__(self, **kwargs):
    """
    See `BaseClassifier` for details on `**kwargs`.
    """
    super().__init__(**kwargs)
    self.txt_embeddings = self.get_txt_emb().to(self.device)
    self.txt_names = self.get_txt_names()
    self._subset_txt_embeddings = None
    self._subset_txt_names = None
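
A minimal construction sketch; the keyword arguments shown are those forwarded to BaseClassifier, and "cuda" is only an assumption about available hardware:

from bioclip import TreeOfLifeClassifier

# Build the classifier with the default BioCLIP model on CPU; pass device="cuda"
# (or a torch.device) if a GPU is available.
classifier = TreeOfLifeClassifier(device="cpu")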

predict(images, rank, min_prob=1e-09, k=5, batch_size=10)

Predicts probabilities at the supplied taxonomic rank for the given images using the Tree of Life embeddings.

Parameters:

  images (List[str] | str | List[Image], required):
      A list of image file paths, a single image file path, or a list of PIL Image objects.
  rank (Rank, required):
      The rank at which to make predictions (e.g., species, genus).
  min_prob (float, default 1e-09):
      The minimum probability threshold for predictions.
  k (int, default 5):
      The number of top predictions to return.
  batch_size (int, default 10):
      The number of images to process in a batch.

Returns:

  List[dict]: A list of dicts with keys "file_name", taxon ranks, "common_name", and "score".

Source code in src/bioclip/predict.py
@torch.no_grad()
def predict(self, images: List[str] | str | List[PIL.Image.Image], rank: Rank, 
            min_prob: float = 1e-9, k: int = 5, batch_size: int = 10) -> dict[str, dict[str, float]]:
    """
    Predicts probabilities for supplied taxa rank for given images using the Tree of Life embeddings.

    Parameters:
        images (List[str] | str | List[PIL.Image.Image]): A list of image file paths, a single image file path, or a list of PIL Image objects.
        rank (Rank): The rank at which to make predictions (e.g., species, genus).
        min_prob (float, optional): The minimum probability threshold for predictions.
        k (int, optional): The number of top predictions to return.
        batch_size (int, optional): The number of images to process in a batch.

    Returns:
        List[dict]: A list of dicts with keys "file_name", taxon ranks, "common_name", and "score".
    """

    if isinstance(images, str):
        images = [images]
    probs = self.create_batched_probabilities_for_images(images, self.get_txt_embeddings(),
                                                         batch_size=batch_size)
    result = []
    for i, image in enumerate(images):
        key = self.make_key(image, i)
        image_probs = probs[key].cpu()
        if rank == Rank.SPECIES:
            result.extend(self.format_species_probs(key, image_probs, k))
        else:
            result.extend(self.format_grouped_probs(key, image_probs, rank, min_prob, k))
    self.record_event(images=images, rank=rank.get_label(), min_prob=min_prob, k=k, batch_size=batch_size)
    return result
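
A minimal usage sketch; the image path is hypothetical:

from bioclip import TreeOfLifeClassifier, Rank

classifier = TreeOfLifeClassifier()
# Top 3 species predictions for a single image file.
predictions = classifier.predict("example_images/frog.jpg", rank=Rank.SPECIES, k=3)
for prediction in predictions:
    # Each dict also carries the taxon rank keys described above (e.g. "species").
    print(prediction["common_name"], prediction["score"])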

get_label_data()

Retrieves label data for the tree of life embeddings as a pandas DataFrame.

Returns:

  pd.DataFrame: A DataFrame containing label data for TOL embeddings.

Source code in src/bioclip/predict.py
def get_label_data(self) -> pd.DataFrame:
    """
    Retrieves label data for the tree of life embeddings as a pandas DataFrame.

    Returns:
        pd.DataFrame: A DataFrame containing label data for TOL embeddings.
    """

    data = []
    for name_ary in self.txt_names:
        data.append(create_classification_dict(names=name_ary, rank=Rank.SPECIES))
    return pd.DataFrame(data, copy=True)
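
A short sketch for inspecting the label data:

from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier()
label_data = classifier.get_label_data()
print(label_data.columns.tolist())   # one column per taxonomic rank label
print(len(label_data))               # one row per Tree of Life text embedding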

create_taxa_filter(rank, user_values)

Creates a filter for taxa based on the specified rank and user-provided values.

Parameters:

  rank (Rank, required):
      The taxonomic rank to filter by.
  user_values (List[str], required):
      A list of user-provided values to filter the taxa.

Returns:

  List[bool]: A list of boolean values indicating whether each entry in the label data matches any of the user-provided values.

Raises:

  ValueError: If any of the user-provided values are not found in the label data for the specified taxonomic rank.

Source code in src/bioclip/predict.py
def create_taxa_filter(self, rank: Rank, user_values: List[str]) -> List[bool]:
    """
    Creates a filter for taxa based on the specified rank and user-provided values.

    Args:
        rank (Rank): The taxonomic rank to filter by.
        user_values (List[str]): A list of user-provided values to filter the taxa.

    Returns:
        List[bool]: A list of boolean values indicating whether each entry in the 
                    label data matches any of the user-provided values.

    Raises:
        ValueError: If any of the user-provided values are not found in the label data 
                    for the specified taxonomic rank.
    """

    taxa_column = rank.get_label()
    label_data = self.get_label_data()

    # Ensure all user values exist
    pd_user_values = pd.Series(user_values, name=taxa_column)
    unknown_values = pd_user_values[~pd_user_values.isin(label_data[taxa_column])]
    if not unknown_values.empty:
        bad_species = ", ".join(unknown_values.values)
        raise ValueError(f"Unknown {taxa_column} received: {bad_species}. Only known {taxa_column} may be used.")

    return label_data[taxa_column].isin(pd_user_values)
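
A sketch of building a filter; the family names are example values and must already exist in the label data, otherwise a ValueError is raised:

from bioclip import TreeOfLifeClassifier, Rank

classifier = TreeOfLifeClassifier()
taxa_filter = classifier.create_taxa_filter(Rank.FAMILY, ["Ranidae", "Bufonidae"])
print(sum(taxa_filter))   # number of Tree of Life entries that would be kept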

apply_filter(keep_labels_ary)

Filters the TOL embeddings based on the provided boolean array. See create_taxa_filter() for an easy way to create this parameter.

Parameters:

  keep_labels_ary (List[bool], required):
      A list of boolean values indicating which TOL embeddings to keep.

Raises:

  ValueError: If the length of keep_labels_ary does not match the expected length.

Source code in src/bioclip/predict.py
def apply_filter(self, keep_labels_ary: List[bool]):
    """
    Filters the TOL embeddings based on the provided boolean array. See `create_taxa_filter()` for an easy way to create this parameter.

    Args:
        keep_labels_ary (List[bool]): A list of boolean values indicating which 
                                      TOL embeddings to keep.

    Raises:
        ValueError: If the length of keep_labels_ary does not match the expected length.
    """

    if len(keep_labels_ary) != len(self.txt_names):
        expected = len(self.txt_names)
        raise ValueError("Invalid keep_embeddings values. " + 
                         f"This parameter should be a list containing {expected} items.")
    embeddings = []
    names = []
    for idx, keep in enumerate(keep_labels_ary):
        if keep:
            embeddings.append(self.txt_embeddings[:,idx])
            names.append(self.txt_names[idx])
    self._subset_txt_embeddings = torch.stack(embeddings, dim=1)
    self._subset_txt_names = names
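
A sketch combining create_taxa_filter() and apply_filter(); the class name and image path are example values:

from bioclip import TreeOfLifeClassifier, Rank

classifier = TreeOfLifeClassifier()
# Keep only embeddings for the class Aves, then predict within that subset.
classifier.apply_filter(classifier.create_taxa_filter(Rank.CLASS, ["Aves"]))
predictions = classifier.predict("example_images/bird.jpg", rank=Rank.SPECIES, k=3)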

bioclip.Rank

Rank for the Tree of Life classification.

  • KINGDOM
  • PHYLUM
  • CLASS
  • ORDER
  • FAMILY
  • GENUS
  • SPECIES
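
A small sketch; Rank members are passed as the rank argument to TreeOfLifeClassifier.predict():

from bioclip import Rank

print(list(Rank))   # all available taxonomic ranks, from KINGDOM down to SPECIES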

bioclip.CustomLabelsClassifier(cls_ary, **kwargs)

Bases: BaseClassifier

A classifier that predicts from a list of custom labels for images.

Initializes the classifier with the given class array and additional keyword arguments.

Parameters:

  cls_ary (List[str], required):
      A list of class names as strings.

Source code in src/bioclip/predict.py
def __init__(self, cls_ary: List[str], **kwargs):
    """
    Initializes the classifier with the given class array and additional keyword arguments.

    Parameters:
        cls_ary (List[str]): A list of class names as strings.
    """
    super().__init__(**kwargs)
    self.tokenizer = create_bioclip_tokenizer(self.model_str)
    self.classes = [cls.strip() for cls in cls_ary]
    self.txt_embeddings = self._get_txt_embeddings(self.classes)
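
A minimal construction sketch; the labels are arbitrary example strings:

from bioclip import CustomLabelsClassifier

classifier = CustomLabelsClassifier(["insect", "spider", "bird"])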

predict(images, k=None, batch_size=10)

Predicts the probabilities for the given images.

Parameters:

  images (List[str] | str | List[Image], required):
      A list of image file paths, a single image file path, or a list of PIL Image objects.
  k (int, default None):
      The number of top probabilities to return. If not specified or if greater than the number of classes, all probabilities are returned.
  batch_size (int, default 10):
      The number of images to process in a batch.

Returns:

  List[dict]: A list of dicts with keys "file_name" and the custom class labels.

Source code in src/bioclip/predict.py
@torch.no_grad()
def predict(self, images: List[str] | str | List[PIL.Image.Image], k: int = None,
            batch_size: int = 10) -> dict[str, float]:
    """
    Predicts the probabilities for the given images.

    Parameters:
        images (List[str] | str | List[PIL.Image.Image]): A list of image file paths, a single image file path, or a list of PIL Image objects.
        k (int, optional): The number of top probabilities to return. If not specified or if greater than the number of classes, all probabilities are returned.
        batch_size (int, optional): The number of images to process in a batch.

    Returns:
        List[dict]: A list of dicts with keys "file_name" and the custom class labels.
    """
    if isinstance(images, str):
        images = [images]
    probs = self.create_batched_probabilities_for_images(images, self.txt_embeddings,
                                                         batch_size=batch_size)
    result = []
    for i, image in enumerate(images):
        key = self.make_key(image, i)
        img_probs = probs[key]
        if not k or k > len(self.classes):
            k = len(self.classes)
        result.extend(self.group_probs(key, img_probs, k))

    self.record_event(images=images, k=k, batch_size=batch_size)
    return result
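
A minimal usage sketch; the labels and image path are example values:

from bioclip import CustomLabelsClassifier

classifier = CustomLabelsClassifier(["insect", "spider", "bird"])
predictions = classifier.predict("example_images/bug.jpg", k=2)
for prediction in predictions:
    print(prediction)   # dict with "file_name" plus the custom label fields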

bioclip.CustomLabelsBinningClassifier(cls_to_bin, **kwargs)

Bases: CustomLabelsClassifier

A classifier that creates predictions for images based on custom labels, groups the labels, and calculates probabilities for each group.

Initializes the class with a dictionary mapping class labels to bin values (group names).

Parameters:

  cls_to_bin (dict, required):
      A dictionary where keys are class labels and values are the bins to group them into.
  **kwargs:
      Additional keyword arguments passed to the superclass initializer.

Raises:

  ValueError: If any value in cls_to_bin is empty, null, or NaN.

Source code in src/bioclip/predict.py
def __init__(self, cls_to_bin: dict, **kwargs):
    """
    Initializes the class with a dictionary mapping class labels to binary values.

    Args:
        cls_to_bin (dict): A dictionary where keys are class labels and values are binary values.
        **kwargs: Additional keyword arguments passed to the superclass initializer.

    Raises:
        ValueError: If any value in `cls_to_bin` is empty, null, or NaN.
    """
    super().__init__(cls_ary=cls_to_bin.keys(), **kwargs)
    self.cls_to_bin = cls_to_bin
    if any([pd.isna(x) or not x for x in cls_to_bin.values()]):
        raise ValueError("Empty, null, or nan are not allowed for bin values.")

bioclip.predict.BaseClassifier(model_str=BIOCLIP_MODEL_STR, pretrained_str=None, device='cpu')

Bases: Module

Initializes the prediction model.

Parameters:

  model_str (str, default BIOCLIP_MODEL_STR):
      The string identifier for the model to be used.
  pretrained_str (str, optional, default None):
      The string identifier for the pretrained model to be loaded.
  device (Union[str, device], default 'cpu'):
      The device on which the model will be run.

Source code in src/bioclip/predict.py
def __init__(self, model_str: str = BIOCLIP_MODEL_STR, pretrained_str: str | None = None, device: Union[str, torch.device] = 'cpu'):
    """
    Initializes the prediction model.

    Parameters:
        model_str (str): The string identifier for the model to be used (defaults to BIOCLIP_MODEL_STR).
        pretrained_str (str, optional): The string identifier for the pretrained model to be loaded.
        device (Union[str, torch.device]): The device on which the model will be run.
    """
    super().__init__()
    self.device = device
    self.load_pretrained_model(model_str=model_str, pretrained_str=pretrained_str)
    self.recorder = None
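
A sketch of passing the BaseClassifier keyword arguments through a subclass; "cuda" assumes a GPU is available:

from bioclip import TreeOfLifeClassifier

# model_str, pretrained_str, and device are accepted by every BaseClassifier subclass.
classifier = TreeOfLifeClassifier(device="cuda")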

forward(x)

Given an input tensor representing multiple images, return probabilities for each class for each image.

Parameters:

  x (torch.Tensor, required):
      Input tensor representing the multiple images.

Returns:

  torch.Tensor: Softmax probabilities of the logits for each class for each image.

Source code in src/bioclip/predict.py
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    Given an input tensor representing multiple images, return probabilities for each class for each image.
    Args:
        x (torch.Tensor): Input tensor representing the multiple images.
    Returns:
        torch.Tensor: Softmax probabilities of the logits for each class for each image.
    """
    img_features = self.model.encode_image(x)
    img_features = F.normalize(img_features, dim=-1)
    return self.create_probabilities(img_features, self.txt_embeddings)
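
A sketch of calling the classifier as a torch.nn.Module; the 224x224 input size is an assumption about the model's preprocessing, and the random tensor only illustrates shapes:

import torch
from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier()
batch = torch.randn(2, 3, 224, 224).to(classifier.device)   # two fake preprocessed images
with torch.no_grad():
    probs = classifier(batch)   # softmax probabilities, one row per image
print(probs.shape)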

get_cached_datafile(filename)

Downloads a datafile from the Hugging Face hub and caches it locally.

Parameters:

  filename (str, required):
      The name of the file to download from the datafile repository.

Returns:

  str: The local path to the downloaded file.

Source code in src/bioclip/predict.py
def get_cached_datafile(self, filename: str) -> str:
    """
    Downloads a datafile from the Hugging Face hub and caches it locally.
    Args:
        filename (str): The name of the file to download from the datafile repository.
    Returns:
        str: The local path to the downloaded file.
    """
    return hf_hub_download(repo_id=self.get_tol_repo_id(), filename=filename, repo_type=HF_DATAFILE_REPO_TYPE)

get_tol_repo_id()

Returns the repository ID for the TreeOfLife datafile based on the model string.

Returns:

  str: The Hugging Face repository ID for the TreeOfLife embeddings.

Raises:

  ValueError: If the model string is not supported.

Source code in src/bioclip/predict.py
def get_tol_repo_id(self) -> str:
    """
    Returns the repository ID for the TreeOfLife datafile based on the model string.
    Raises:
        ValueError: If the model string is not supported.
    Returns:
        str: The Hugging Face repository ID for the TreeOfLife embeddings.
    """
    return get_tol_repo_id(self.model_str)

get_txt_emb()

Retrieves TreeOfLife text embeddings for the current model from the associated Hugging Face dataset repo.

Returns:

  torch.Tensor: A tensor containing the text embeddings for the tree of life.

Source code in src/bioclip/predict.py
def get_txt_emb(self) -> torch.Tensor:
    """
    Retrieves TreeOfLife text embeddings for the current model from the associated Hugging Face dataset repo.
    Returns:
        torch.Tensor: A tensor containing the text embeddings for the tree of life.
    """
    txt_emb_npy = self.get_cached_datafile("embeddings/txt_emb_species.npy")
    return torch.from_numpy(np.load(txt_emb_npy))

get_txt_names()

Retrieves TreeOfLife text names for the current model from the associated Hugging Face dataset repo.

Returns:

  List[List[str]]: A list of lists, where each inner list contains names corresponding to the text embeddings.

Source code in src/bioclip/predict.py
def get_txt_names(self) -> List[List[str]]:
    """
    Retrieves TreeOfLife text names for the current model from the  associated Hugging Face dataset repo.
    Returns:
        List[List[str]]: A list of lists, where each inner list contains names corresponding to the text embeddings.
    """
    txt_names_json = self.get_cached_datafile("embeddings/txt_emb_species.json")
    with open(txt_names_json) as fd:
        txt_names = json.load(fd)
    return txt_names
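
A short sketch; as apply_filter() above shows, each embedding column lines up with one entry in the names list:

from bioclip import TreeOfLifeClassifier

classifier = TreeOfLifeClassifier()
txt_emb = classifier.get_txt_emb()        # torch.Tensor of Tree of Life text embeddings
txt_names = classifier.get_txt_names()    # list of name lists, one per embedding column
print(txt_emb.shape, len(txt_names))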

bioclip.recorder

Records predictions made by a classifier and saves the output to a file.

attach_prediction_recorder(classifier, **top_level_settings)

Attach a PredictionRecorder to the classifier instance that will record metadata and subsequent predictions. Call save_recorded_predictions to save the recorded predictions to a file.

Parameters:

  classifier (object, required):
      The classifier (such as TreeOfLifeClassifier) instance to attach the recorder to.
  **top_level_settings:
      Additional settings to be recorded.

Returns:

  PredictionRecorder: An instance of PredictionRecorder attached to the classifier.

Source code in src/bioclip/recorder.py
def attach_prediction_recorder(classifier: object, **top_level_settings):
    """
    Attach a PredictionRecorder to the classifier instance that will record metadata and subsequent predictions.
    Call save_recorded_predictions to save the recorded predictions to a file.

    Args:
        classifier (object): The classifier (such as TreeOfLifeClassifier) instance to attach the recorder to.
        **top_level_settings: Additional settings to be recorded.

    Returns:
        PredictionRecorder: An instance of PredictionRecorder attached to the classifier.
    """
    recorder = PredictionRecorder(classifier, **top_level_settings)
    classifier.set_recorder(recorder)
    return recorder
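
A minimal sketch; the top-level setting name and image path are hypothetical:

from bioclip import TreeOfLifeClassifier, Rank
from bioclip.recorder import attach_prediction_recorder

classifier = TreeOfLifeClassifier()
attach_prediction_recorder(classifier, notes="frog survey")
classifier.predict("example_images/frog.jpg", rank=Rank.SPECIES)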

save_recorded_predictions(classifier, path, include_command_line=True)

Saves recorded predictions from the classifier to a file. Before calling this function, ensure that the classifier has a recorder attached using attach_prediction_recorder. The recorder's data is written to the specified file path in either JSON or plain-text format: if the file extension is '.json', the data is serialized as JSON; otherwise, the data is appended in a human-readable text format.

Parameters:

  classifier (object, required):
      The classifier instance (such as TreeOfLifeClassifier) with recorded predictions.
  path (str, required):
      The file path where the report will be saved.
  include_command_line (bool, default True):
      When True, includes the Python command line in the log file.

Raises:

  ValueError: If the output path extension is .json and the file already exists.

Source code in src/bioclip/recorder.py
def save_recorded_predictions(classifier: object, path: str, include_command_line: bool = True):
    """
    Saves recorded predictions from the classifier to a file.
    Before calling this function, ensure that the classifier has a recorder attached
    using attach_prediction_recorder. Saves the recorder's data to the specified file path in 
    either JSON or plain text format. If the file extension is '.json', the data is serialized
    as JSON. Otherwise, the data is appended in a human-readable text format.

    Args:
        classifier (object): The classifier instance (such as TreeOfLifeClassifier) with recorded predictions.
        path (str): The file path where the report will be saved.
        include_command_line (bool): When True includes the python command line in the log file.

    Raises:
        ValueError: If the output path extension is .json and the file already exists.
    """
    if classifier.recorder:
        command_line = " ".join(sys.argv) if include_command_line else None
        classifier.recorder.create_report(path, command_line=command_line)
    else:
        raise ValueError("The classifier does not have a recorder attached.")