Source code for wsipipe.datasets.stripai

"""
This module creates the dataframe for the STRIP AI dataset with the following columns:
    - The slide column stores the paths on disk of the whole slide images
    - The annotation column records a string with the slide label 
    - The label column is the slide level label
    - The tags column contains the center and patient for each slide

This assumes the data on disk has the same folder structure as the
download from the Kaggle website:
https://www.kaggle.com/competitions/mayo-clinic-strip-ai/data

"""

import os
from pathlib import Path
import pandas as pd

def convert_to_pyramids(data_root: Path = Path("data", "mayo-clinic-strip-ai"), out_root: Path = Path("experiments", "mayo_pyramids"), project_root: Path = None):
    """ Create pyramids for whole slide images

    The whole slide images as downloaded only contain data at level 0,
    no other levels are present. This can make it slow to access the slides.
    This function runs over all the ``*.tif`` slides in the ``train/train``
    and ``test/test`` folders of the dataset and writes out tiled,
    LZW-compressed pyramidal copies using the ``vips`` command line tool.
    Copies are written to ``<out_root>/train`` and ``<out_root>/test``;
    slides whose output file already exists are skipped.

    A failed conversion (``vips`` missing, or exiting with a non-zero status)
    is reported on stdout and the remaining slides are still processed.

    Args:
        data_root (Path, optional): location of the strip ai data, relative to
            the project root if one is given. Defaults to data/mayo-clinic-strip-ai.
        out_root (Path, optional): folder the pyramids are written to, relative
            to the project root if one is given. Defaults to experiments/mayo_pyramids.
        project_root (Path, optional): if given, it is prepended to both
            data_root and out_root. Defaults to None.

    """
    # Imported locally so this function does not depend on a module-level import.
    import subprocess

    def convert(in_path, out_path):
        print(f"Converting {in_path}")
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact and avoids shell injection.
        command = [
            "vips", "tiffsave", str(in_path), str(out_path),
            "--compression=lzw",
            "--tile", "--tile-width=256", "--tile-height=256",
            "--pyramid",
        ]
        try:
            result = subprocess.run(command, check=False)
        except OSError as err:
            # e.g. the vips executable is not installed / not on PATH
            print(f"Could not run vips for {in_path}: {err}")
            return
        if result.returncode != 0:
            print(f"vips exited with status {result.returncode} for {in_path}")

    data_root = Path(data_root)
    out_root = Path(out_root)

    if project_root is not None:
        data_root = Path(project_root) / data_root
        out_root = Path(project_root) / out_root

    # The kaggle download nests each split in a folder of the same name,
    # e.g. train/train/*.tif; the output uses a single level per split.
    for split in ("train", "test"):
        for img_path in sorted((data_root / split / split).glob("*.tif")):
            output_path = out_root / split / img_path.name
            if output_path.exists():
                continue
            # make sure the destination folder exists before writing into it
            output_path.parent.mkdir(parents=True, exist_ok=True)
            convert(img_path, output_path)


def training(data_root: Path = Path("data", "mayo-clinic-strip-ai"), project_root: Path = None) -> pd.DataFrame:
    """ Create Strip AI training dataset

    This function goes through the ``train/train`` folder of the dataset,
    and matches every ``*.tif`` slide found there (in sorted order) with its
    row in ``train.csv``, using the file name without extension as the image_id.
    It creates a dataframe with the slide path and the matching slide label,
    which is stored for both label and annotation.
    The tags column stores the center id and patient id.

    Args:
        data_root (Path, optional): location of the stripai data, relative to
            the project root if one is given. Defaults to data/mayo-clinic-strip-ai.
        project_root (Path, optional): if given, it is prepended to data_root.
            Defaults to None.
    Returns:
        df (pd.DataFrame): A dataframe with columns slide, annotation, label and tags
    Raises:
        IndexError: if a slide on disk has no matching image_id in train.csv
    """
    # set up the paths to the slides and annotations
    data_root = Path(data_root)
    if project_root is not None:
        data_root = Path(project_root) / data_root

    dataset_root = data_root / "train" / "train"
    labels_path = data_root / "train.csv"
    labels_df = pd.read_csv(labels_path)

    # Index the csv by image_id once instead of scanning it for every slide;
    # if an image_id is repeated, the first row is the one that is used.
    labels_by_id = labels_df.drop_duplicates(subset="image_id").set_index("image_id")

    slidepaths = []
    annots = []
    labels = []
    tags = []
    # sorted so the row order does not depend on the file system
    for slide_path in sorted(dataset_root.glob("*.tif")):
        image_id = slide_path.stem
        if image_id not in labels_by_id.index:
            raise IndexError(f"No row with image_id {image_id!r} found in {labels_path}")
        label = labels_by_id.at[image_id, "label"]
        center_id = labels_by_id.at[image_id, "center_id"]
        patient_id = labels_by_id.at[image_id, "patient_id"]

        slidepaths.append(slide_path)
        # the slide level label doubles as the annotation
        annots.append(label)
        labels.append(label)
        tags.append(f"center{center_id}; patient {patient_id}")

    df = pd.DataFrame()
    df["slide"] = slidepaths
    df["annotation"] = annots
    df["label"] = labels
    df["tags"] = tags

    return df

# convert_to_pyramids()