Skip to content

Python API

bioclip.TreeOfLifeClassifier(**kwargs)

Bases: BaseClassifier

A classifier for predicting taxonomic ranks for images.

See BaseClassifier for details on **kwargs.

Source code in src/bioclip/predict.py
413
414
415
416
417
418
419
420
421
def __init__(self, **kwargs):
    """
    Set up the classifier and fetch the Tree of Life text embeddings and names.

    See `BaseClassifier` for details on `**kwargs`.
    """
    super().__init__(**kwargs)
    # The embeddings are moved onto the device stored by the base initializer.
    embeddings = get_txt_emb()
    self.txt_embeddings = embeddings.to(self.device)
    self.txt_names = get_txt_names()
    # Both subset attributes start as None; `apply_filter()` assigns them.
    self._subset_txt_embeddings = self._subset_txt_names = None

predict(images, rank, min_prob=1e-09, k=5)

Predicts probabilities for supplied taxa rank for given images using the Tree of Life embeddings.

Parameters:

Name Type Description Default
images List[str] | str | List[Image]

A list of image file paths, a single image file path, or a list of PIL Image objects.

required
rank Rank

The rank at which to make predictions (e.g., species, genus).

required
min_prob float

The minimum probability threshold for predictions.

1e-09
k int

The number of top predictions to return.

5

Returns:

Type Description
List[dict]

List[dict]: A list of dicts with keys "file_name", taxon ranks, "common_name", and "score".

Source code in src/bioclip/predict.py
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
@torch.no_grad()
def predict(self, images: List[str] | str | List[PIL.Image.Image], rank: Rank, min_prob: float = 1e-9, k: int = 5) -> List[dict]:
    """
    Predicts probabilities for supplied taxa rank for given images using the Tree of Life embeddings.

    Parameters:
        images (List[str] | str | List[PIL.Image.Image]): A list of image file paths, a single image file path, or a list of PIL Image objects.
            A single string is wrapped into a one-element list.
        rank (Rank): The rank at which to make predictions (e.g., species, genus).
        min_prob (float, optional): The minimum probability threshold for predictions.
            Only passed along when `rank` is not `Rank.SPECIES`.
        k (int, optional): The number of top predictions to return.

    Returns:
        List[dict]: A list of dicts with keys "file_name", taxon ranks, "common_name", and "score".
            The entries for all images are concatenated into this single flat list.
    """

    if isinstance(images, str):
        images = [images]
    # Probabilities are looked up per image through the key built by make_key().
    probs = self.create_probabilities_for_images(images, self.get_txt_embeddings())
    result = []
    for i, image in enumerate(images):
        key = self.make_key(image, i)
        if rank == Rank.SPECIES:
            # Species-level formatting takes no min_prob threshold.
            result.extend(self.format_species_probs(key, probs[key], k))
        else:
            result.extend(self.format_grouped_probs(key, probs[key], rank, min_prob, k))
    return result

get_label_data()

Retrieves label data for the tree of life embeddings as a pandas DataFrame.

Returns:

Type Description
DataFrame

pd.DataFrame: A DataFrame containing label data for TOL embeddings.

Source code in src/bioclip/predict.py
437
438
439
440
441
442
443
444
445
446
447
448
def get_label_data(self) -> pd.DataFrame:
    """
    Builds a pandas DataFrame describing the labels of the tree of life embeddings.

    Each entry of `self.txt_names` is converted with `create_classification_dict`
    at species rank and becomes one row of the result.

    Returns:
        pd.DataFrame: A DataFrame containing label data for TOL embeddings.
    """
    rows = [
        create_classification_dict(names=name_ary, rank=Rank.SPECIES)
        for name_ary in self.txt_names
    ]
    return pd.DataFrame(rows, copy=True)

create_taxa_filter(rank, user_values)

Creates a filter for taxa based on the specified rank and user-provided values.

Parameters:

Name Type Description Default
rank Rank

The taxonomic rank to filter by.

required
user_values List[str]

A list of user-provided values to filter the taxa.

required

Returns:

Type Description
List[bool]

List[bool]: A list of boolean values indicating whether each entry in the label data matches any of the user-provided values.

Raises:

Type Description
ValueError

If any of the user-provided values are not found in the label data for the specified taxonomic rank.

Source code in src/bioclip/predict.py
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
def create_taxa_filter(self, rank: Rank, user_values: List[str]) -> pd.Series:
    """
    Creates a filter for taxa based on the specified rank and user-provided values.

    Args:
        rank (Rank): The taxonomic rank to filter by.
        user_values (List[str]): A list of user-provided values to filter the taxa.

    Returns:
        pd.Series: A boolean Series with one entry per row of the label data,
                   True where the row's value for the rank matches any of the
                   user-provided values. It can be passed to `apply_filter()`.

    Raises:
        ValueError: If any of the user-provided values are not found in the label data 
                    for the specified taxonomic rank.
    """

    taxa_column = rank.get_label()
    known_values = self.get_label_data()[taxa_column]

    # Reject the request outright if any value is absent from the label data.
    requested = pd.Series(user_values, name=taxa_column)
    unknown_values = requested[~requested.isin(known_values)]
    if not unknown_values.empty:
        unknown_list = ", ".join(unknown_values.values)
        raise ValueError(f"Unknown {taxa_column} received: {unknown_list}. Only known {taxa_column} may be used.")

    return known_values.isin(requested)

apply_filter(keep_labels_ary)

Filters the TOL embeddings based on the provided boolean array. See create_taxa_filter() for an easy way to create this parameter.

Parameters:

Name Type Description Default
keep_labels_ary List[bool]

A list of boolean values indicating which TOL embeddings to keep.

required

Raises:

Type Description
ValueError

If the length of keep_labels_ary does not match the expected length.

Source code in src/bioclip/predict.py
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
def apply_filter(self, keep_labels_ary: List[bool]):
    """
    Restricts the TOL embeddings to the entries flagged in a boolean array. See `create_taxa_filter()` for an easy way to create this parameter.

    The selected embedding columns and their names are stored on
    `self._subset_txt_embeddings` and `self._subset_txt_names`.

    Args:
        keep_labels_ary (List[bool]): A list of boolean values indicating which 
                                      TOL embeddings to keep.

    Raises:
        ValueError: If the length of keep_labels_ary does not match the expected length.
    """

    expected = len(self.txt_names)
    if len(keep_labels_ary) != expected:
        raise ValueError("Invalid keep_embeddings values. " + 
                         f"This parameter should be a list containing {expected} items.")
    # Positions of the truthy flags; each one selects a column of the embeddings.
    kept_positions = [pos for pos, keep in enumerate(keep_labels_ary) if keep]
    kept_columns = [self.txt_embeddings[:, pos] for pos in kept_positions]
    self._subset_txt_embeddings = torch.stack(kept_columns, dim=1)
    self._subset_txt_names = [self.txt_names[pos] for pos in kept_positions]

bioclip.Rank

Rank for the Tree of Life classification.

  • KINGDOM
  • PHYLUM
  • CLASS
  • ORDER
  • FAMILY
  • GENUS
  • SPECIES

bioclip.CustomLabelsClassifier(cls_ary, **kwargs)

Bases: BaseClassifier

A classifier that predicts from a list of custom labels for images.

Initializes the classifier with the given class array and additional keyword arguments.

Parameters:

Name Type Description Default
cls_ary List[str]

A list of class names as strings.

required
Source code in src/bioclip/predict.py
274
275
276
277
278
279
280
281
282
283
284
def __init__(self, cls_ary: List[str], **kwargs):
    """
    Sets up the classifier for a list of custom class labels.

    Parameters:
        cls_ary (List[str]): A list of class names as strings.
        **kwargs: Forwarded unchanged to the superclass initializer.
    """
    super().__init__(**kwargs)
    self.tokenizer = create_bioclip_tokenizer(self.model_str)
    # Surrounding whitespace is stripped from each label before it is stored.
    cleaned_labels = [label.strip() for label in cls_ary]
    self.classes = cleaned_labels
    self.txt_embeddings = self._get_txt_embeddings(cleaned_labels)

predict(images, k=None)

Predicts the probabilities for the given images.

Parameters:

Name Type Description Default
images List[str] | str | List[Image]

A list of image file paths, a single image file path, or a list of PIL Image objects.

required
k int

The number of top probabilities to return. If not specified or if greater than the number of classes, all probabilities are returned.

None

Returns:

Type Description
List[dict]

List[dict]: A list of dicts with keys "file_name" and the custom class labels.

Source code in src/bioclip/predict.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
@torch.no_grad()
def predict(self, images: List[str] | str | List[PIL.Image.Image], k: int | None = None) -> List[dict]:
    """
    Predicts the probabilities for the given images.

    Parameters:
        images (List[str] | str | List[PIL.Image.Image]): A list of image file paths, a single image file path, or a list of PIL Image objects.
            A single string is wrapped into a one-element list.
        k (int, optional): The number of top probabilities to return. If not specified (or otherwise falsy, e.g. 0) or if greater than the number of classes, all probabilities are returned.

    Returns:
        List[dict]: A list of dicts with keys "file_name" and the custom class labels.
            The entries for all images are concatenated into this single flat list.
    """
    if isinstance(images, str):
        images = [images]
    probs = self.create_probabilities_for_images(images, self.txt_embeddings)
    # The clamp does not depend on the image, so it is applied once up front.
    num_classes = len(self.classes)
    if not k or k > num_classes:
        k = num_classes
    result = []
    for i, image in enumerate(images):
        key = self.make_key(image, i)
        result.extend(self.group_probs(key, probs[key], k))
    return result

bioclip.CustomLabelsBinningClassifier(cls_to_bin, **kwargs)

Bases: CustomLabelsClassifier

A classifier that creates predictions for images based on custom labels, groups the labels, and calculates probabilities for each group.

Initializes the class with a dictionary mapping class labels to bin names.

Parameters:

Name Type Description Default
cls_to_bin dict

A dictionary where keys are class labels and values are the names of the bins (groups) the labels belong to.

required
**kwargs

Additional keyword arguments passed to the superclass initializer.

{}

Raises:

Type Description
ValueError

If any value in cls_to_bin is empty, null, or NaN.

Source code in src/bioclip/predict.py
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
def __init__(self, cls_to_bin: dict, **kwargs):
    """
    Initializes the class with a dictionary mapping class labels to bin values.

    The bin values are validated before the superclass initializer runs, so
    invalid input is rejected without doing any of the superclass setup.

    Args:
        cls_to_bin (dict): A dictionary where keys are class labels and values are the bins
            the labels are assigned to.
        **kwargs: Additional keyword arguments passed to the superclass initializer.

    Raises:
        ValueError: If any value in `cls_to_bin` is empty, null, or NaN.
    """
    # Fail fast: `not x` rejects empty/None values, `pd.isna` rejects NaN.
    if any(pd.isna(x) or not x for x in cls_to_bin.values()):
        raise ValueError("Empty, null, or nan are not allowed for bin values.")
    super().__init__(cls_ary=cls_to_bin.keys(), **kwargs)
    self.cls_to_bin = cls_to_bin

bioclip.predict.BaseClassifier(model_str=BIOCLIP_MODEL_STR, pretrained_str=None, device='cpu')

Bases: Module

Initializes the prediction model.

Parameters:

Name Type Description Default
model_str str

The string identifier for the model to be used.

BIOCLIP_MODEL_STR
pretrained_str str

The string identifier for the pretrained model to be loaded.

None
device Union[str, device]

The device on which the model will be run.

'cpu'
Source code in src/bioclip/predict.py
179
180
181
182
183
184
185
186
187
188
189
190
def __init__(self, model_str: str = BIOCLIP_MODEL_STR, pretrained_str: str | None = None, device: Union[str, torch.device] = 'cpu'):
    """
    Initializes the prediction model.

    Records the target device on the instance and then loads the model
    through `load_pretrained_model()`.

    Parameters:
        model_str (str): The string identifier for the model to be used.
        pretrained_str (str, optional): The string identifier for the pretrained model to be loaded.
        device (Union[str, torch.device]): The device on which the model will be run.
    """
    super().__init__()
    # Stored before loading; presumably load_pretrained_model() relies on
    # self.device -- TODO confirm against its implementation.
    self.device = device
    self.load_pretrained_model(model_str=model_str, pretrained_str=pretrained_str)

forward(x)

Given an input tensor representing multiple images, return probabilities for each class for each image.

Args: x (torch.Tensor): Input tensor representing the multiple images.

Returns: torch.Tensor: Softmax probabilities of the logits for each class for each image.

Source code in src/bioclip/predict.py
256
257
258
259
260
261
262
263
264
265
266
def forward(self, x: torch.Tensor) -> torch.Tensor:
    """
    Compute per-class probabilities for a batch of images.

    The images are encoded with the model, the resulting features are
    normalized along their last dimension, and the probabilities are then
    computed against `self.txt_embeddings`.

    Args:
        x (torch.Tensor): Input tensor representing the multiple images.
    Returns:
        torch.Tensor: Softmax probabilities of the logits for each class for each image.
    """
    image_embeddings = F.normalize(self.model.encode_image(x), dim=-1)
    return self.create_probabilities(image_embeddings, self.txt_embeddings)