Working with datasets in ClearML
These instructions show how to work with datasets in ClearML, using the CIFAR10 image dataset as an example.
ClearML uses the Dataset class to work with datasets. In the ClearML WebApp, a dataset is displayed as an experiment of the Data Processing type.
Before you start working with datasets, you need to prepare the environment.
Prepare the environment
- Install a conda package manager, for example Miniconda or Anaconda.
- Create a conda environment:

  conda env create --file environment.yml
  File environment.yml:

  name: clearml_datasets
  channels:
    - defaults
  dependencies:
    - ca-certificates=2022.4.26
    - certifi=2022.5.18.1
    - libffi=3.3
    - ncurses=6.3
    - openssl=1.1.1o
    - pip=21.2.4
    - python=3.8.13
    - readline=8.1.2
    - setuptools=61.2.0
    - sqlite=3.38.3
    - tk=8.6.12
    - wheel=0.37.1
    - xz=5.2.5
    - zlib=1.2.12
    - pip:
        - attrs==21.4.0
        - boto3==1.24.22
        - botocore==1.27.22
        - charset-normalizer==2.0.12
        - clearml==1.5.0
        - furl==2.1.3
        - future==0.18.2
        - idna==3.3
        - importlib-resources==5.8.0
        - jsonschema==4.6.0
        - numpy==1.22.4
        - orderedmultidict==1.0.1
        - pathlib2==2.3.7.post1
        - pillow==9.1.1
        - psutil==5.9.1
        - pyjwt==2.4.0
        - pyparsing==3.0.9
        - pyrsistent==0.18.1
        - python-dateutil==2.8.2
        - pyyaml==6.0
        - python-mnist==0.7
        - requests==2.28.0
        - tqdm==4.64.0
        - six==1.16.0
        - urllib3==1.26.9
        - zipp==3.8.0
- Activate the environment:

  conda activate clearml_datasets
- Check the connection of the ClearML SDK to the ClearML Server:

  clearml-init
- Make sure that the clearml.conf configuration file contains a valid ClearML Server URL of the form http://yourdomain.cmlp.selectel.ru. See the ClearML documentation for more information.
- Check that the clearml.conf configuration file describes the connection to the storage. We recommend connecting ClearML to Selectel object storage; see the sketch below.
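For S3-compatible object storage, the connection is described in the sdk.aws.s3 section of clearml.conf. A minimal sketch; the host, bucket, and keys below are placeholders, replace them with your own Selectel values:

  sdk {
      aws {
          s3 {
              credentials: [
                  {
                      # Placeholder values: substitute your Selectel object
                      # storage endpoint, bucket, and access keys
                      host: "s3.storage.selcloud.ru"
                      bucket: "clearml-datasets"
                      key: "ACCESS_KEY"
                      secret: "SECRET_KEY"
                      multipart: false
                      secure: true
                  }
              ]
          }
      }
  }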
Prepare data
Before using the examples, download the CIFAR10 dataset and prepare it for use.
Example of a script for data preparation:
import os
import pickle
import shutil

import numpy as np
import requests
import tqdm
from typing import Dict, Tuple, List, Text
from PIL import Image

from source.auxiliary_code.global_config import get_temp_data_path


def unpickle(file: Text) -> Dict:
    # Load one pickled CIFAR10 batch file
    with open(file, 'rb') as fo:
        batch = pickle.load(fo, encoding='bytes')
    return batch


def prepare_temp_folder(dataset_name: Text) -> Text:
    # Create the temporary data folder and a subfolder for the dataset
    temp_folder_path = get_temp_data_path()
    if not os.path.exists(temp_folder_path):
        os.mkdir(temp_folder_path)
    if not os.path.exists(os.path.join(temp_folder_path, dataset_name)):
        os.mkdir(os.path.join(temp_folder_path, dataset_name))
    return temp_folder_path


def get_data_archive(temp_folder: Text) -> Text:
    # Download the CIFAR10 archive unless it is already on disk
    archive_name = "cifar-10-python.tar.gz"
    archive_url = "https://www.cs.toronto.edu/~kriz/{}".format(archive_name)
    archive_path = os.path.join(temp_folder, archive_name)
    print("Downloading data archive from {}".format(archive_url))
    if not os.path.exists(archive_path):
        r = requests.get(archive_url)
        with open(archive_path, 'wb') as f:
            f.write(r.content)
    return archive_path


def unzip_data(archive_path: Text) -> Text:
    # Unpack the archive into a folder named after it and return that folder
    data_folder = archive_path.split("/")[-1].split(".")[0]
    extract_dir = "{}/{}".format("/".join(archive_path.split("/")[:-1]), data_folder)
    print("Extracting data archive to {}".format(extract_dir))
    shutil.unpack_archive(archive_path, extract_dir)
    return extract_dir


def transform_images(batches: List[Text]) -> List[Dict]:
    # Decode each pickled batch into PIL images with labels and file names
    images = []
    for batch_path in batches:
        batch = unpickle(batch_path)
        for i in tqdm.tqdm(range(len(batch[b'data']))):
            images.append({
                "image": Image.fromarray(np.reshape(batch[b'data'][i], (32, 32, 3), order='F')),
                "label": str(batch[b'labels'][i]),
                "file_name": batch[b'filenames'][i].decode('utf-8')
            })
    return images


def save_images(images: List[Dict], folder: Text) -> List[Text]:
    # Save images to disk, one subfolder per class label
    image_paths = []
    for i in tqdm.tqdm(range(len(images))):
        if not os.path.exists(os.path.join(folder, images[i]["label"])):
            os.mkdir(os.path.join(folder, images[i]["label"]))
        images[i]["image"].save(os.path.join(folder, images[i]["label"], images[i]["file_name"]))
        image_paths.append(os.path.join(folder, images[i]["label"], images[i]["file_name"]))
    return image_paths


def extract(dataset_name: Text) -> Text:
    # Download and unpack the raw dataset; return the extraction folder
    temp_folder_path = prepare_temp_folder(dataset_name)
    archive_path = get_data_archive(os.path.join(temp_folder_path, dataset_name))
    data_path = unzip_data(archive_path)
    return data_path


def transform(data_path: Text) -> Tuple[List, List]:
    # Read the train and test pickle batches and decode the images
    test_batches = [
        os.path.join(data_path, "cifar-10-batches-py", "test_batch")
    ]
    train_batches = [
        os.path.join(data_path, "cifar-10-batches-py", "data_batch_1"),
        os.path.join(data_path, "cifar-10-batches-py", "data_batch_2"),
        os.path.join(data_path, "cifar-10-batches-py", "data_batch_3"),
        os.path.join(data_path, "cifar-10-batches-py", "data_batch_4"),
        os.path.join(data_path, "cifar-10-batches-py", "data_batch_5")
    ]
    print("Extracting train images from pickle batches")
    train_images = transform_images(train_batches)
    print("Extracting test images from pickle batches")
    test_images = transform_images(test_batches)
    return test_images, train_images


def load(images: Tuple[List, List], dataset_name: Text) -> Tuple[List, List]:
    # Write the decoded images into train/ and test/ folders on disk
    test_images, train_images = images
    temp_folder_path = get_temp_data_path()
    dataset_folder = os.path.join(temp_folder_path, dataset_name)
    dataset_train_folder = os.path.join(dataset_folder, "train")
    dataset_test_folder = os.path.join(dataset_folder, "test")
    if not os.path.exists(dataset_train_folder):
        os.mkdir(dataset_train_folder)
    if not os.path.exists(dataset_test_folder):
        os.mkdir(dataset_test_folder)
    print("Saving train images to {}".format(dataset_train_folder))
    train_image_paths = save_images(train_images, dataset_train_folder)
    print("Saving test images to {}".format(dataset_test_folder))
    test_image_paths = save_images(test_images, dataset_test_folder)
    return train_image_paths, test_image_paths


if __name__ == "__main__":
    data_path = extract("CIFAR10")
    images = transform(data_path)
    res = load(images, "CIFAR10")
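After the script finishes, the images should sit in per-label subfolders under the train and test directories. A quick sanity check of the produced layout; a minimal sketch that reuses the same get_temp_data_path() helper (CIFAR10 has 10 classes, 50,000 train and 10,000 test images):

import os

from source.auxiliary_code.global_config import get_temp_data_path

# Count the saved images per split to confirm the preparation succeeded
base = os.path.join(get_temp_data_path(), "CIFAR10")
for split in ("train", "test"):
    split_dir = os.path.join(base, split)
    labels = sorted(os.listdir(split_dir))
    total = sum(len(os.listdir(os.path.join(split_dir, label))) for label in labels)
    print("{}: {} classes, {} images".format(split, len(labels), total))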
Create a dataset
To work with datasets in ClearML, use the Dataset class.
An example script for creating a dataset:
from clearml import Dataset

from source.auxiliary_code import global_config

if __name__ == "__main__":
    dataset_name = 'CIFAR10'

    # Create a new, empty dataset in the given project
    cifar10_dataset = Dataset.create(dataset_project=global_config.DATASET_PROJECT, dataset_name=dataset_name)

    # List all datasets in the project, including unfinished ones
    for dataset in Dataset.list_datasets(dataset_project=global_config.DATASET_PROJECT, only_completed=False):
        print(dataset)
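Dataset.create can also take parent_datasets to start a new version that inherits the contents of an existing one. A minimal sketch; it assumes the CIFAR10 dataset created above already exists in the project:

from clearml import Dataset

from source.auxiliary_code import global_config

# Sketch: create a child version that inherits files from the parent
parent = Dataset.get(dataset_project=global_config.DATASET_PROJECT, dataset_name='CIFAR10')
child = Dataset.create(
    dataset_project=global_config.DATASET_PROJECT,
    dataset_name='CIFAR10',
    parent_datasets=[parent.id],
)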
Load data into a dataset
You can load new data into an existing dataset.
In this example, the ClearML File Server is used for data storage. You can configure ClearML Server to work with any storage, for example, connect Selectel object storage.
An example script to load data into a dataset:
import os

from clearml import Dataset

from source.auxiliary_code import global_config

if __name__ == "__main__":
    dataset_name = 'CIFAR10'

    # Fetch the dataset created earlier
    cifar10_dataset = Dataset.get(dataset_project=global_config.DATASET_PROJECT, dataset_name=dataset_name)

    # Register the local training images under the 'CIFAR10/train' path
    # inside the dataset
    data_path = os.path.join(global_config.get_temp_data_path(), dataset_name)
    cifar10_dataset.add_files(
        path=os.path.join(data_path, 'train'),
        dataset_path=os.path.join(dataset_name, 'train'),
        verbose=True
    )

    # Upload the registered files to the configured storage
    cifar10_dataset.upload(verbose=True)
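The script above uploads to the default storage and leaves the dataset open, so it is not yet marked as completed. A minimal sketch of directing the upload to S3-compatible storage instead and then closing the dataset; it continues from the cifar10_dataset variable above, and the bucket path is a placeholder:

# Sketch: upload to S3-compatible storage instead of the File Server,
# then finalize so the dataset shows as completed in the WebApp.
# 's3://clearml-datasets/cifar10' is a placeholder bucket path.
cifar10_dataset.upload(output_url='s3://clearml-datasets/cifar10', verbose=True)
cifar10_dataset.finalize()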
Add metadata for a dataset
You can add metadata for a dataset. In this example, tags are added; you can use them to filter datasets.
Sample script for adding tags:
from clearml import Dataset

from source.auxiliary_code import global_config

if __name__ == "__main__":
    dataset_name = 'CIFAR10'

    # Fetch the dataset and attach descriptive tags
    cifar10_dataset = Dataset.get(dataset_project=global_config.DATASET_PROJECT, dataset_name=dataset_name)
    cifar10_dataset.add_tags(['image', 'classification', 'example', 'small'])

    # List all datasets in the project to verify the tags
    for dataset in Dataset.list_datasets(dataset_project=global_config.DATASET_PROJECT, only_completed=False):
        print(dataset)
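Once tags are set, they can be used to narrow the listing: Dataset.list_datasets accepts a tags argument. A minimal sketch:

from clearml import Dataset

from source.auxiliary_code import global_config

# Sketch: list only datasets carrying the 'classification' tag
for dataset in Dataset.list_datasets(
    dataset_project=global_config.DATASET_PROJECT,
    tags=['classification'],
    only_completed=False,
):
    print(dataset)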