Working with datasets in ClearML

For your information

In this instruction, we look at examples of working with datasets in ClearML using the CIFAR10 image dataset as an example.

To work with datasets in ClearML, the Dataset class is used. In the ClearML WebApp, a dataset is displayed as an experiment (Experiment) with the Data Processing type.

Before you start working with datasets, you need to prepare the environment.

Prepare the environment

Install the conda package management system.

Create a conda environment:

conda env create --file environment.yml

environment.yml file

# Conda environment definition used by the dataset examples.
# Every package is pinned to an exact version.
name: clearml_datasets
channels:
  - defaults
dependencies:
  # Packages installed by conda from the channels listed above.
  - ca-certificates=2022.4.26
  - certifi=2022.5.18.1
  - libffi=3.3
  - ncurses=6.3
  - openssl=1.1.1o
  - pip=21.2.4
  - python=3.8.13
  - readline=8.1.2
  - setuptools=61.2.0
  - sqlite=3.38.3
  - tk=8.6.12
  - wheel=0.37.1
  - xz=5.2.5
  - zlib=1.2.12
  # Packages installed by pip inside the environment.
  - pip:
    - attrs==21.4.0
    - boto3==1.24.22
    - botocore==1.27.22
    - charset-normalizer==2.0.12
    - clearml==1.5.0
    - furl==2.1.3
    - future==0.18.2
    - idna==3.3
    - importlib-resources==5.8.0
    - jsonschema==4.6.0
    - numpy==1.22.4
    - orderedmultidict==1.0.1
    - pathlib2==2.3.7.post1
    - pillow==9.1.1
    - psutil==5.9.1
    - pyjwt==2.4.0
    - pyparsing==3.0.9
    - pyrsistent==0.18.1
    - python-dateutil==2.8.2
    - pyyaml==6.0
    - python-mnist==0.7
    - requests==2.28.0
    - tqdm==4.64.0
    - six==1.16.0
    - urllib3==1.26.9
    - zipp==3.8.0

Activate the environment:
```
conda activate clearml_datasets
```
Check the ClearML SDK connection to the ClearML Server:
```
clearml-init
```
Make sure the configuration file clearml.conf specifies the correct ClearML Server URL in the form of http://yourdomain.cmlp.selectel.ru. Read more in the ClearML documentation.
Check that the clearml.conf configuration file describes the storage connection. We recommend connecting ClearML to Selectel S3.

Prepare data

Before using the examples, download the CIFAR10 dataset and prepare it for use.

Sample script for data preparation

import os
import numpy as np
import tqdm
import shutil
import requests
from typing import Dict, Tuple, List, Text
from PIL import Image

from source.auxiliary_code.global_config import get_temp_data_path

def unpickle(file: Text) -> Dict:
    """Load a pickled batch file and return its contents.

    Args:
        file: Path of the pickle file to read.

    Returns:
        The unpickled object. The file is read with ``encoding='bytes'``,
        so 8-bit strings pickled by Python 2 (including dictionary keys)
        come back as ``bytes``, for example ``b'data'``.
    """
    import pickle

    # SECURITY: unpickling can execute arbitrary code. Only call this on
    # files that come from a trusted source.
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='bytes')
    return batch

def prepare_temp_folder(dataset_name: Text) -> Text:
    """Make sure the temporary data folder and the dataset subfolder exist.

    Args:
        dataset_name: Name of the subfolder to create inside the temporary
            data folder returned by ``get_temp_data_path()``.

    Returns:
        The path of the temporary data folder itself (not the path of the
        dataset subfolder).
    """
    temp_folder_path = get_temp_data_path()

    # One makedirs call creates the temp folder (with any missing parents)
    # and the dataset subfolder; exist_ok=True makes it a no-op when the
    # folders are already there, without a separate existence check.
    os.makedirs(os.path.join(temp_folder_path, dataset_name), exist_ok=True)

    return temp_folder_path


def get_data_archive(temp_folder: Text) -> Text:
    """Download the CIFAR-10 python archive into ``temp_folder`` if needed.

    If the archive file is already present in ``temp_folder`` nothing is
    downloaded and the existing path is returned.

    Args:
        temp_folder: Existing folder in which the archive is stored.

    Returns:
        Path of the archive file inside ``temp_folder``.

    Raises:
        requests.HTTPError: If the server responds with an error status.
    """
    archive_name = "cifar-10-python.tar.gz"
    archive_url = "https://www.cs.toronto.edu/~kriz/{}".format(archive_name)

    archive_path = os.path.join(temp_folder, archive_name)

    if os.path.exists(archive_path):
        return archive_path

    print("Downloading data archive from {}".format(archive_url))

    # Download into a side file and rename it only after the transfer
    # succeeded, so an interrupted download is never mistaken for a
    # complete, cached archive on the next run.
    partial_path = archive_path + ".part"

    # stream=True avoids holding the whole archive in memory; the timeout
    # (connect, read) keeps a stalled connection from hanging forever.
    with requests.get(archive_url, stream=True, timeout=(10, 60)) as response:
        # Fail loudly instead of saving an HTTP error page as the archive.
        response.raise_for_status()
        with open(partial_path, 'wb') as archive_file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                archive_file.write(chunk)

    os.replace(partial_path, archive_path)

    return archive_path


def unzip_data(archive_path: Text) -> Text:
    """Unpack an archive into a sibling folder named after the archive.

    The target folder is the archive's directory joined with the archive
    file name up to its first dot, e.g. ``/tmp/cifar-10-python.tar.gz`` is
    unpacked into ``/tmp/cifar-10-python``.

    Args:
        archive_path: Path of the archive to unpack. The format is detected
            by ``shutil.unpack_archive`` from the file extension.

    Returns:
        Path of the folder the archive was extracted to.
    """
    # os.path handles the platform's separators and a bare file name; the
    # latter must map to a relative folder, not to one under "/".
    parent_dir, file_name = os.path.split(archive_path)
    data_folder = file_name.split(".")[0]

    extract_dir = os.path.join(parent_dir, data_folder)

    print("Extracting data archive to {}".format(extract_dir))

    # NOTE: only unpack archives from a trusted source; archive members are
    # written to disk as-is.
    shutil.unpack_archive(archive_path, extract_dir)

    return extract_dir


def transform_images(batches: List[Text]) -> List[Dict]:
    """Decode pickled CIFAR-10 batches into PIL images with metadata.

    Args:
        batches: Paths of the pickled batch files to read.

    Returns:
        One dictionary per image, in batch order, with the keys
        ``"image"`` (a ``PIL.Image``), ``"label"`` (the class label as a
        string) and ``"file_name"`` (the original file name as ``str``).
    """
    images = []

    for batch_path in batches:
        batch = unpickle(batch_path)

        records = zip(batch[b'data'], batch[b'labels'], batch[b'filenames'])
        for pixels, label, file_name in tqdm.tqdm(records, total=len(batch[b'data'])):
            # Per the CIFAR-10 format description each row holds 1024 red,
            # then 1024 green, then 1024 blue values, each channel stored
            # row by row. Reshape to (channel, row, column) and move the
            # channel axis last to get the (height, width, RGB) layout PIL
            # expects. A Fortran-order reshape to (32, 32, 3) would swap
            # rows and columns and produce transposed pictures.
            pixel_array = np.reshape(pixels, (3, 32, 32)).transpose(1, 2, 0)

            images.append({
                "image": Image.fromarray(np.ascontiguousarray(pixel_array)),
                "label": str(label),
                "file_name": file_name.decode('utf-8')
            })

    return images


def save_images(images: List[Dict], folder: Text) -> List[Text]:
    """Write images to ``folder``, grouped into one subfolder per label.

    Args:
        images: Dictionaries with the keys ``"image"`` (an object with a
            ``save(path)`` method), ``"label"`` and ``"file_name"``.
        folder: Folder that receives the per-label subfolders.

    Returns:
        Paths of the written files, in the same order as ``images``.
    """
    image_paths = []
    for item in tqdm.tqdm(images):
        label_folder = os.path.join(folder, item["label"])
        # exist_ok=True replaces the separate existence check; the label
        # folder is created on first use and reused afterwards.
        os.makedirs(label_folder, exist_ok=True)

        image_path = os.path.join(label_folder, item["file_name"])
        item["image"].save(image_path)
        image_paths.append(image_path)

    return image_paths


def extract(dataset_name: Text) -> Text:
    """Fetch the data archive for ``dataset_name`` and unpack it.

    Prepares the temporary folders, obtains the archive inside the
    dataset's subfolder and extracts it.

    Args:
        dataset_name: Name of the dataset subfolder in the temp folder.

    Returns:
        Path of the folder the archive was extracted to.
    """
    dataset_folder = os.path.join(prepare_temp_folder(dataset_name), dataset_name)
    return unzip_data(get_data_archive(dataset_folder))


def transform(data_path: Text) -> Tuple[List, List]:
    """Decode the extracted train and test batches into image records.

    Args:
        data_path: Folder that contains the ``cifar-10-batches-py`` folder.

    Returns:
        A ``(test_images, train_images)`` tuple; note that the test images
        come first.
    """
    batches_root = os.path.join(data_path, "cifar-10-batches-py")

    train_batch_names = (
        "data_batch_1",
        "data_batch_2",
        "data_batch_3",
        "data_batch_4",
        "data_batch_5",
    )
    train_batches = [os.path.join(batches_root, name) for name in train_batch_names]
    test_batches = [os.path.join(batches_root, "test_batch")]

    # Train batches are decoded before the test batch.
    print("Extracting train images from pickle batches")
    decoded_train = transform_images(train_batches)
    print("Extracting test images from pickle batches")
    decoded_test = transform_images(test_batches)

    return decoded_test, decoded_train


def load(images: Tuple[List, List], dataset_name: Text) -> Tuple[List, List]:
    """Save decoded images into ``train`` and ``test`` dataset folders.

    Args:
        images: A ``(test_images, train_images)`` tuple of image records.
        dataset_name: Name of the dataset subfolder in the temp folder.

    Returns:
        A ``(train_image_paths, test_image_paths)`` tuple. Note that the
        order is the reverse of the ``images`` argument.
    """
    test_images, train_images = images

    dataset_folder = os.path.join(get_temp_data_path(), dataset_name)

    dataset_train_folder = os.path.join(dataset_folder, "train")
    dataset_test_folder = os.path.join(dataset_folder, "test")

    # makedirs with exist_ok=True needs no prior existence check and also
    # creates the dataset folder itself when it is missing.
    os.makedirs(dataset_train_folder, exist_ok=True)
    os.makedirs(dataset_test_folder, exist_ok=True)

    print("Saving train images to {}".format(dataset_train_folder))
    train_image_paths = save_images(train_images, dataset_train_folder)
    print("Saving test images to {}".format(dataset_test_folder))
    test_image_paths = save_images(test_images, dataset_test_folder)

    return train_image_paths, test_image_paths


if __name__ == "__main__":
    # Run the pipeline stages in order: fetch and unpack the archive,
    # decode the batches into images, then write the images to disk.
    target_dataset = "CIFAR10"

    unpacked_path = extract(target_dataset)
    decoded_images = transform(unpacked_path)
    saved_paths = load(decoded_images, target_dataset)

Create a dataset

To work with datasets in ClearML, use the Dataset class.

Sample script for creating a Dataset:

from clearml import Dataset

from source.auxiliary_code import global_config

if __name__ == "__main__":
    project = global_config.DATASET_PROJECT

    # Create the dataset in the configured project.
    cifar10_dataset = Dataset.create(dataset_project=project, dataset_name='CIFAR10')

    # Print the datasets of the project; only_completed=False also includes
    # datasets that are not completed.
    project_datasets = Dataset.list_datasets(dataset_project=project, only_completed=False)
    for entry in project_datasets:
        print(entry)

Load data into a dataset

You can load new data into an existing dataset.

In the example, a File Server is used to store data. You can configure the ClearML Server to work with any storage, for example, connect Selectel S3.

Sample script for loading data into a dataset:

import os

from clearml import Dataset

from source.auxiliary_code import global_config

if __name__ == "__main__":
    dataset_name = 'CIFAR10'
    subset = 'train'

    # Look up the existing dataset in the configured project.
    cifar10_dataset = Dataset.get(
        dataset_name=dataset_name,
        dataset_project=global_config.DATASET_PROJECT,
    )

    local_root = os.path.join(global_config.get_temp_data_path(), dataset_name)

    # Register the local train folder under "<dataset_name>/train" inside
    # the dataset, then upload the added files.
    cifar10_dataset.add_files(
        path=os.path.join(local_root, subset),
        dataset_path=os.path.join(dataset_name, subset),
        verbose=True,
    )

    cifar10_dataset.upload(verbose=True)

Add metadata for a dataset

You can add metadata for a dataset. In the example, tags (Tags) are added—they can be used to filter datasets.

Sample script for adding tags:

from clearml import Dataset

from source.auxiliary_code import global_config

if __name__ == "__main__":
    project = global_config.DATASET_PROJECT

    # Look up the existing dataset and attach the tags to it.
    cifar10_dataset = Dataset.get(dataset_project=project, dataset_name='CIFAR10')
    tags = ['image', 'classification', 'example', 'small']
    cifar10_dataset.add_tags(tags)

    # Print the datasets of the project; only_completed=False also includes
    # datasets that are not completed.
    project_datasets = Dataset.list_datasets(dataset_project=project, only_completed=False)
    for entry in project_datasets:
        print(entry)