Semantic Segmentation on Oxford-IIIT Pet Dataset

Semantic Segmentation with Amazon SageMaker

Computer Vision
Semantic Segmentation
SageMaker
ResNet-50
FCN
Python
Author

Kobus Esterhuysen

Published

April 8, 2021

Back to Blog |  LearnableLoopAI.com |  Portfolio of Projects |  LinkedIn


1. Introduction

There are four prominent computer vision tasks:

1. Image Classification
2. Object Detection
3. Semantic Segmentation
4. Instance Segmentation

This post shows an example of Semantic Segmentation. We will use the Oxford-IIIT Pet Dataset.

The dataset covers 37 categories of dog and cat breeds, with roughly 200 images for each class. The images have large variations in scale, pose and lighting. Every image has an associated ground-truth annotation of breed, head ROI, and pixel-level trimap segmentation. We will make use of the trimap segmentation pixel labels.

To perform the training and deployment of our semantic segmentation model we will make use of Amazon SageMaker. SageMaker provides a seamless pipeline.

2. Platform Setup

This notebook should be run on a ‘notebook instance’, which can be set up within the SageMaker service.

!pip3 install tqdm
!pip3 install pillow --upgrade
Requirement already satisfied: tqdm in /home/ec2-user/anaconda3/envs/amazonei_mxnet_p36/lib/python3.6/site-packages (4.60.0)
Requirement already satisfied: pillow in /home/ec2-user/anaconda3/envs/amazonei_mxnet_p36/lib/python3.6/site-packages (8.2.0)
%matplotlib inline
import os
import io
import tarfile
import urllib
import shutil
import json
import random
import numpy as np
from tqdm import tqdm
from PIL import Image, ImageDraw, ImageFont
from matplotlib import pyplot as plt
from xml.etree import ElementTree as ET
import boto3
import sagemaker
from sagemaker.amazon.amazon_estimator import get_image_uri
sagemaker.__version__
'2.32.0'

3. Get Data

#
# setup folders on the notebook instance/VM
# create the data folder on first use; an existing folder is left untouched
os.makedirs('data', exist_ok=True)
urls = ['http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz',
        'http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz']


def download_and_extract(data_dir, download_dir, sources=None):
    """Download each archive into ``download_dir`` and unpack it into ``data_dir``.

    An archive whose file name is already present in ``download_dir`` is
    skipped entirely: it is neither downloaded nor extracted again.

    Args:
        data_dir: Directory the archive contents are extracted into.
        download_dir: Directory the downloaded archive files are saved to.
        sources: Optional iterable of archive URLs; defaults to the
            module-level ``urls`` list.
    """
    # `import urllib` alone does not guarantee the `request` submodule is loaded.
    import urllib.request

    for url in (urls if sources is None else sources):
        target_file = url.split('/')[-1]
        archive_path = os.path.join(download_dir, target_file)
        if target_file not in os.listdir(download_dir):
            urllib.request.urlretrieve(url, archive_path)
            # Open the archive where it was saved (not relative to the current
            # working directory) and close it once extraction is done.
            with tarfile.open(archive_path) as tf:
                tf.extractall(data_dir)
            print(f'{url} downloaded and extracted ...', )
        else:
            print('Already downloaded', url)
download_and_extract('data', '.')
http://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz downloaded and extracted ...
http://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz downloaded and extracted ...
# 
# remove the tars
!rm *.tar.gz

4. Inspect Data

We will use the following convention for naming data objects:

* x: train data
* y: label data
* yh: predicted data (‘h’ is for ‘hat’)

y_fold = 'data/annotations/trimaps/'
# Keep only real trimap files. Without the hidden-file check this listing has
# 14780 entries for 7390 images, i.e. exactly two per image -- presumably
# hidden '._*.png' companion files shipped in the archive (TODO confirm).
ys = [img for img in os.listdir(y_fold)
      if img.endswith('.png') and not img.startswith('.')]
print(len(ys))
14780
x_fold = 'data/images/'
# every entry of the image folder whose name ends in 'jpg' is one input sample
image_dir_entries = os.listdir(x_fold)
xs = [entry for entry in image_dir_entries if entry.endswith('jpg')]
print(len(xs))
7390
xs[7]
'scottish_terrier_88.jpg'
num_examples = 5
plt.figure(figsize=(15, 15))


def _show_panel(position, image_path):
    """Draw the image file at ``image_path`` in cell ``position`` of the
    ``num_examples`` x 2 grid, with axis ticks removed."""
    plt.subplot(num_examples, 2, position)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(plt.imread(image_path))


# one row per randomly drawn example: image on the left, its trimap on the right
for i in range(num_examples):
    ix = random.randint(0, len(xs) - 1)
    x_name = xs[ix]
    y_name = x_name.split('.')[0] + '.png'
    _show_panel(2 * i + 1, os.path.join(x_fold, x_name))
    _show_panel(2 * i + 2, os.path.join(y_fold, y_name))

ys[10]
'english_cocker_spaniel_173.png'
# 
# unique values in a label map
trimap_path = os.path.join(y_fold, ys[10])
img = Image.open(trimap_path)
# convert to an array and list the distinct pixel values found in this trimap
print(np.unique(np.asarray(img)))
[1 2 3]

5. Setup SageMaker

%%time
# session object reused by the later S3 upload and training cells
ses = sagemaker.Session()
# execution role the training job will run under
role = sagemaker.get_execution_role()
print(role)
arn:aws:iam::005868863755:role/service-role/AmazonSageMaker-ExecutionRole-20210401T120740
CPU times: user 211 ms, sys: 25 ms, total: 236 ms
Wall time: 280 ms
buck = 'oxford-iiit-pet'
# docker image of the semantic-segmentation algorithm for the session's region
training_image = sagemaker.image_uris.retrieve(
    framework='semantic-segmentation',
    region=ses.boto_region_name,
)
print(training_image)
433757028032.dkr.ecr.us-west-2.amazonaws.com/semantic-segmentation:1

6. Split Data into train/valid

folders = ['train', 'train_annotation', 'validation', 'validation_annotation']


def _reset_dir(path):
    """Remove ``path`` if it is an existing directory, then create it empty."""
    if os.path.isdir(path):
        shutil.rmtree(path)
    os.mkdir(path)


# every run starts from empty split folders
for folder in folders:
    _reset_dir(folder)
def get_map_file(image):
    """Return the trimap file name that belongs to an image file name.

    Only the final extension is swapped for ``.png``, so a name that contains
    additional dots keeps its full stem.

    Args:
        image: Image file name, e.g. ``'scottish_terrier_88.jpg'``.

    Returns:
        The matching label-map file name, e.g. ``'scottish_terrier_88.png'``.

    Raises:
        AssertionError: If the derived name is not listed in ``ys``.
    """
    map_file = os.path.splitext(image)[0] + '.png'
    # Raised explicitly (rather than via `assert`) so the check also runs
    # under `python -O`; the exception type is unchanged for callers.
    if map_file not in ys:
        raise AssertionError(f'no trimap {map_file!r} found for image {image!r}')
    return map_file
%%time
# copy every image together with its trimap into either the train or the
# validation folders
for image in tqdm(xs):
    # randint(0, 99) draws one of 100 values, so `< 75` holds for 75 of them
    if random.randint(0, 99) < 75:
        target_set = 'train'
    else:
        target_set = 'validation'

    # resolved once; the same name is used for the source and the target
    map_file = get_map_file(image)

    shutil.copy(os.path.join('data/images/', image),
                os.path.join(target_set, image))
    shutil.copy(os.path.join(y_fold, map_file),
                os.path.join(target_set + '_annotation', map_file))
100%|██████████| 7390/7390 [00:12<00:00, 598.10it/s] 
CPU times: user 2.45 s, sys: 1.61 s, total: 4.06 s
Wall time: 12.4 s
# directory listings of the training split: images and their trimaps
xs_trn, ys_trn = os.listdir('train'), os.listdir('train_annotation')
# the split loop copies one trimap per image, so both counts should agree
print(len(ys_trn), len(xs_trn))
5548 5548

7. Position Data on S3

buck
'oxford-iiit-pet'
%%time
ses = sagemaker.Session()


def _upload_folder(folder, label):
    """Upload local ``folder`` to the bucket under a key prefix of the same
    name, report it as ``label`` and return what ``upload_data`` returned."""
    s3_uri = ses.upload_data(path=folder, bucket=buck, key_prefix=folder)
    print(f'{label} uploaded')
    return s3_uri


print('uploading data to S3 ...')
s3_xs_trn_path = _upload_folder('train', 'xs_trn')
s3_ys_trn_path = _upload_folder('train_annotation', 'ys_trn')
s3_xs_val_path = _upload_folder('validation', 'xs_val')
s3_ys_val_path = _upload_folder('validation_annotation', 'ys_val')
uploading data to S3 ...
xs_trn uploaded
ys_trn uploaded
xs_val uploaded
ys_val uploaded
CPU times: user 1min 43s, sys: 7.06 s, total: 1min 50s
Wall time: 13min 36s
s3_xs_trn_path
's3://oxford-iiit-pet/train'
s3_ys_trn_path
's3://oxford-iiit-pet/train_annotation'

8. SageMaker Model

# all job settings are collected first, then handed to the estimator in one go
estimator_settings = dict(
    image_uri=training_image,
    role=role,
    instance_count=1,
    instance_type='ml.p3.2xlarge',
    volume_size=100,
    max_run=36000,
    input_mode='File',
    # model artifacts go to the same bucket that holds the data
    output_path=f's3://{buck}/model-output',
    sagemaker_session=ses,
)
model = sagemaker.estimator.Estimator(**estimator_settings)
# hyperparameters for the semantic-segmentation training job
hyperparameters = dict(
    backbone='resnet-50',  # encoder
    algorithm='fcn',  # decoder; other options: 'psp', 'deeplab'
    use_pretrained_model='True',  # pre-trained on ImageNet
    crop_size=240,  # size of image random crop
    num_classes=4,
    epochs=10,
    learning_rate=0.0001,
    optimizer='rmsprop',  # other options: 'sgd', 'adam', 'nag', 'adagrad'
    lr_scheduler='poly',  # other options: 'cosine', 'step'
    mini_batch_size=16,
    validation_mini_batch_size=16,
    early_stopping=True,
    early_stopping_patience=3,
    # NOTE(review): equal to `epochs`, so early stopping presumably cannot
    # end the job before the last epoch -- confirm this is intended.
    early_stopping_min_epochs=10,
    num_training_samples=len(xs_trn),
)
model.set_hyperparameters(**hyperparameters)

9. Train Model

distribution = 'FullyReplicated'


def _as_folder_uri(s3_uri):
    """Return ``s3_uri`` with exactly one trailing slash."""
    return s3_uri.rstrip('/') + '/'


# An S3 URI is matched as a key *prefix*: 's3://<bucket>/train' also matches
# every key under 'train_annotation/'. The training log shows the effect
# ("/opt/ml/input/data/train/train_annotation is not a readable image file").
# Ending each URI with '/' restricts every channel to its own folder.
data_channels = {
    'train': sagemaker.inputs.TrainingInput(
        _as_folder_uri(s3_xs_trn_path), distribution=distribution),
    'validation': sagemaker.inputs.TrainingInput(
        _as_folder_uri(s3_xs_val_path), distribution=distribution),
    'train_annotation': sagemaker.inputs.TrainingInput(
        _as_folder_uri(s3_ys_trn_path), distribution=distribution),
    'validation_annotation': sagemaker.inputs.TrainingInput(
        _as_folder_uri(s3_ys_val_path), distribution=distribution),
}
data_channels
{'train': <sagemaker.inputs.TrainingInput at 0x7f62ca0996a0>,
 'validation': <sagemaker.inputs.TrainingInput at 0x7f62ca099710>,
 'train_annotation': <sagemaker.inputs.TrainingInput at 0x7f62ca099748>,
 'validation_annotation': <sagemaker.inputs.TrainingInput at 0x7f62ca099860>}
%%time
model.fit(inputs=data_channels, logs=True)
2021-04-08 14:05:21 Starting - Starting the training job...
2021-04-08 14:05:44 Starting - Launching requested ML instancesProfilerReport-1617890720: InProgress
......
2021-04-08 14:06:44 Starting - Preparing the instances for training......
2021-04-08 14:07:44 Downloading - Downloading input data...............
2021-04-08 14:10:14 Training - Training image download completed. Training in progress..Docker entrypoint called with argument(s): train
Running default environment configuration script
Running custom environment configuration script
[04/08/2021 14:10:20 INFO 140384837293888] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/default-input.json: {'backbone': 'resnet-50', 'algorithm': 'fcn', 'use_pretrained_model': 'True', 'crop_size': '240', 'epochs': '10', 'learning_rate': '0.001', 'optimizer': 'sgd', 'lr_scheduler': 'poly', 'lr_scheduler_factor': '0.1', 'weight_decay': '0.0001', 'momentum': '0.9', 'gamma1': '0.9', 'gamma2': '0.9', 'mini_batch_size': '16', 'validation_mini_batch_size': '16', 'num_training_samples': '8', 'early_stopping_min_epochs': '5', 'early_stopping': 'False', 'early_stopping_patience': '4', 'early_stopping_tolerance': '0.0', 'precision_dtype': 'float32', '_kvstore': 'device', '_num_kv_servers': 'auto', 'syncbn': 'False', '_workers': '16', '_aux_loss': 'True', '_aux_weight': '0.5', '_hybrid': 'False', '_augmentation_type': 'default', '_logging_frequency': '20'}
[04/08/2021 14:10:20 INFO 140384837293888] Merging with provided configuration from /opt/ml/input/config/hyperparameters.json: {'num_classes': '4', 'num_training_samples': '5548', 'early_stopping': 'True', 'early_stopping_patience': '3', 'crop_size': '240', 'lr_scheduler': 'poly', 'optimizer': 'rmsprop', 'use_pretrained_model': 'True', 'backbone': 'resnet-50', 'validation_mini_batch_size': '16', 'epochs': '10', 'learning_rate': '0.0001', 'early_stopping_min_epochs': '10', 'algorithm': 'fcn', 'mini_batch_size': '16'}
[04/08/2021 14:10:20 INFO 140384837293888] Final configuration: {'backbone': 'resnet-50', 'algorithm': 'fcn', 'use_pretrained_model': 'True', 'crop_size': '240', 'epochs': '10', 'learning_rate': '0.0001', 'optimizer': 'rmsprop', 'lr_scheduler': 'poly', 'lr_scheduler_factor': '0.1', 'weight_decay': '0.0001', 'momentum': '0.9', 'gamma1': '0.9', 'gamma2': '0.9', 'mini_batch_size': '16', 'validation_mini_batch_size': '16', 'num_training_samples': '5548', 'early_stopping_min_epochs': '10', 'early_stopping': 'True', 'early_stopping_patience': '3', 'early_stopping_tolerance': '0.0', 'precision_dtype': 'float32', '_kvstore': 'device', '_num_kv_servers': 'auto', 'syncbn': 'False', '_workers': '16', '_aux_loss': 'True', '_aux_weight': '0.5', '_hybrid': 'False', '_augmentation_type': 'default', '_logging_frequency': '20', 'num_classes': '4'}
Process 1 is a worker.
[04/08/2021 14:10:20 INFO 140384837293888] Using default worker.
[04/08/2021 14:10:20 INFO 140384837293888] Loaded iterator creator application/json for content type ('application/json', '1.0')
[04/08/2021 14:10:20 INFO 140384837293888] font search path ['/opt/amazon/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf', '/opt/amazon/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/afm', '/opt/amazon/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/pdfcorefonts']
[04/08/2021 14:10:20 INFO 140384837293888] generated new fontManager
[04/08/2021 14:10:21 INFO 140384837293888] Loaded iterator creator application/x-image for content type ('application/x-image', '1.0')
[04/08/2021 14:10:21 INFO 140384837293888] Loaded iterator creator application/x-recordio for content type ('application/x-recordio', '1.0')
[04/08/2021 14:10:21 INFO 140384837293888] Loaded iterator creator image/jpeg for content type ('image/jpeg', '1.0')
[04/08/2021 14:10:21 INFO 140384837293888] Loaded iterator creator image/png for content type ('image/png', '1.0')
[04/08/2021 14:10:21 INFO 140384837293888] Checkpoint loading and saving are disabled.
[04/08/2021 14:10:21 WARNING 140384837293888] /opt/ml/input/data/train/train_annotation is not a readable image file
[04/08/2021 14:10:26 WARNING 140384837293888] label maps not provided, using defaults.
[04/08/2021 14:10:26 INFO 140384837293888] #label_map train :{'scale': 1}
[04/08/2021 14:10:27 WARNING 140384837293888] /opt/ml/input/data/validation/validation_annotation is not a readable image file
[04/08/2021 14:10:28 WARNING 140384837293888] label maps not provided, using defaults.
[04/08/2021 14:10:28 INFO 140384837293888] #label_map validation :{'scale': 1}
[04/08/2021 14:10:28 INFO 140384837293888] nvidia-smi: took 0.032 seconds to run.
[04/08/2021 14:10:28 INFO 140384837293888] nvidia-smi identified 1 GPUs.
[04/08/2021 14:10:28 INFO 140384837293888] Number of GPUs being used: 1
[04/08/2021 14:10:28 INFO 140384837293888] Number of GPUs being used: 1
[04/08/2021 14:10:28 INFO 140384837293888] Number of GPUs being used: 1
[14:10:31] /opt/brazil-pkg-cache/packages/AIAlgorithmsMXNet/AIAlgorithmsMXNet-1.4.x.3313.0/AL2_x86_64/generic-flavor/src/src/storage/storage.cc:108: Using GPUPooledRoundedStorageManager.
[04/08/2021 14:10:35 INFO 140384837293888] LRScheduler setup: iters per epoch: 346, num_epochs 10
#metrics {"StartTime": 1617891035.7269874, "EndTime": 1617891035.7271817, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "Meta": "init_train_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:10:43 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 20 speed: 57.23477743217807 samples/sec learning_rate: 0.000099
[04/08/2021 14:10:48 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 40 speed: 57.12151060055633 samples/sec learning_rate: 0.000099
[04/08/2021 14:10:54 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 60 speed: 57.01716746219607 samples/sec learning_rate: 0.000098
[04/08/2021 14:11:00 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 80 speed: 56.669445439007745 samples/sec learning_rate: 0.000098
[04/08/2021 14:11:05 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 100 speed: 57.15989796048737 samples/sec learning_rate: 0.000097
[04/08/2021 14:11:11 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 120 speed: 56.73896674318229 samples/sec learning_rate: 0.000097
[04/08/2021 14:11:17 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 140 speed: 57.07696036864635 samples/sec learning_rate: 0.000096
[04/08/2021 14:11:23 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 160 speed: 57.253283737437506 samples/sec learning_rate: 0.000096
[04/08/2021 14:11:28 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 180 speed: 57.160287450161704 samples/sec learning_rate: 0.000095
[04/08/2021 14:11:34 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 200 speed: 57.30681912309092 samples/sec learning_rate: 0.000095
[04/08/2021 14:11:40 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 220 speed: 56.50268794344752 samples/sec learning_rate: 0.000094
[04/08/2021 14:11:46 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 240 speed: 56.42174009912436 samples/sec learning_rate: 0.000094
[04/08/2021 14:11:51 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 260 speed: 56.426531597206456 samples/sec learning_rate: 0.000093
[04/08/2021 14:11:57 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 280 speed: 56.70646072518913 samples/sec learning_rate: 0.000093
[04/08/2021 14:12:03 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 300 speed: 56.45411038475419 samples/sec learning_rate: 0.000092
[04/08/2021 14:12:09 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 320 speed: 57.32169967567549 samples/sec learning_rate: 0.000092
[04/08/2021 14:12:14 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 340 speed: 57.21111266457744 samples/sec learning_rate: 0.000091
[04/08/2021 14:12:16 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 0, train loss: 0.583177892631189 .
[04/08/2021 14:12:16 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 0, train throughput: 56.491655280649155 samples/sec.
[04/08/2021 14:12:21 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 20 speed: 78.7017959465133 samples/sec
[04/08/2021 14:12:25 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 40 speed: 90.27349390297218 samples/sec
[04/08/2021 14:12:29 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 60 speed: 77.33342552196048 samples/sec
[04/08/2021 14:12:33 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 80 speed: 85.69685453635314 samples/sec
[04/08/2021 14:12:36 INFO 140384837293888] #progress_notice. epoch: 0, iterations: 100 speed: 88.21083680127448 samples/sec
[04/08/2021 14:12:39 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 0, validation pixel_accuracy: 0.8907448765851449 .
[04/08/2021 14:12:39 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 0, validation mIOU: 0.5528668904414418 .
[04/08/2021 14:12:39 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 0, validation throughput: 84.9662831404142 samples/sec.
[04/08/2021 14:12:39 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:12:39 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:12:39 INFO 140384837293888] #progress_metric: host=algo-1, completed 10.0 % of epochs
#metrics {"StartTime": 1617891035.7279649, "EndTime": 1617891159.932177, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 0, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 1.0, "count": 1, "min": 1, "max": 1}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:12:46 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 20 speed: 56.415336892102054 samples/sec learning_rate: 0.000090
[04/08/2021 14:12:52 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 40 speed: 57.243711700399885 samples/sec learning_rate: 0.000090
[04/08/2021 14:12:58 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 60 speed: 56.579574857705566 samples/sec learning_rate: 0.000089
[04/08/2021 14:13:04 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 80 speed: 56.17581460913996 samples/sec learning_rate: 0.000089
[04/08/2021 14:13:09 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 100 speed: 56.344003976303384 samples/sec learning_rate: 0.000088
[04/08/2021 14:13:15 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 120 speed: 56.36397415158647 samples/sec learning_rate: 0.000088
[04/08/2021 14:13:21 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 140 speed: 56.739206601508336 samples/sec learning_rate: 0.000087
[04/08/2021 14:13:27 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 160 speed: 56.69596895059569 samples/sec learning_rate: 0.000087
[04/08/2021 14:13:33 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 180 speed: 56.354602496405874 samples/sec learning_rate: 0.000086
[04/08/2021 14:13:38 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 200 speed: 56.293620954902266 samples/sec learning_rate: 0.000086
[04/08/2021 14:13:44 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 220 speed: 56.68323073700428 samples/sec learning_rate: 0.000085
[04/08/2021 14:13:50 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 240 speed: 56.55654401725969 samples/sec learning_rate: 0.000085
[04/08/2021 14:13:56 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 260 speed: 56.20460800160803 samples/sec learning_rate: 0.000084
[04/08/2021 14:14:01 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 280 speed: 55.754374187888196 samples/sec learning_rate: 0.000084
[04/08/2021 14:14:07 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 300 speed: 55.93001920204822 samples/sec learning_rate: 0.000083
[04/08/2021 14:14:13 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 320 speed: 56.947929651865834 samples/sec learning_rate: 0.000082
[04/08/2021 14:14:19 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 340 speed: 56.959143366154784 samples/sec learning_rate: 0.000082
[04/08/2021 14:14:21 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 1, train loss: 0.447907111057287 .
[04/08/2021 14:14:21 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 1, train throughput: 56.361313717493665 samples/sec.
[04/08/2021 14:14:25 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 20 speed: 84.9008574992599 samples/sec
[04/08/2021 14:14:29 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 40 speed: 84.4207801160601 samples/sec
[04/08/2021 14:14:33 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 60 speed: 92.37830181277074 samples/sec
[04/08/2021 14:14:37 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 80 speed: 91.48692836742623 samples/sec
[04/08/2021 14:14:40 INFO 140384837293888] #progress_notice. epoch: 1, iterations: 100 speed: 85.63954978178198 samples/sec
[04/08/2021 14:14:43 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 1, validation pixel_accuracy: 0.9046230563103864 .
[04/08/2021 14:14:43 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 1, validation mIOU: 0.5752667065386995 .
[04/08/2021 14:14:43 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 1, validation throughput: 86.60699598953663 samples/sec.
[04/08/2021 14:14:43 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:14:43 INFO 140384837293888] #progress_metric: host=algo-1, completed 20.0 % of epochs
#metrics {"StartTime": 1617891159.932389, "EndTime": 1617891283.7740269, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 1, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 2.0, "count": 1, "min": 2, "max": 2}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:14:50 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 20 speed: 56.667339942242414 samples/sec learning_rate: 0.000081
[04/08/2021 14:14:56 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 40 speed: 56.99281102202384 samples/sec learning_rate: 0.000081
[04/08/2021 14:15:02 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 60 speed: 56.10016108901861 samples/sec learning_rate: 0.000080
[04/08/2021 14:15:08 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 80 speed: 56.44580068432545 samples/sec learning_rate: 0.000080
[04/08/2021 14:15:13 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 100 speed: 56.948654542985885 samples/sec learning_rate: 0.000079
[04/08/2021 14:15:19 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 120 speed: 56.12606037898068 samples/sec learning_rate: 0.000079
[04/08/2021 14:15:25 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 140 speed: 56.81029391846133 samples/sec learning_rate: 0.000078
[04/08/2021 14:15:31 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 160 speed: 56.333740460834754 samples/sec learning_rate: 0.000078
[04/08/2021 14:15:36 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 180 speed: 56.73891877176043 samples/sec learning_rate: 0.000077
[04/08/2021 14:15:42 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 200 speed: 56.07597248880091 samples/sec learning_rate: 0.000076
[04/08/2021 14:15:48 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 220 speed: 57.40156973839974 samples/sec learning_rate: 0.000076
[04/08/2021 14:15:54 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 240 speed: 56.70871289069762 samples/sec learning_rate: 0.000075
[04/08/2021 14:15:59 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 260 speed: 56.62650512946434 samples/sec learning_rate: 0.000075
[04/08/2021 14:16:05 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 280 speed: 56.38116361736109 samples/sec learning_rate: 0.000074
[04/08/2021 14:16:11 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 300 speed: 56.56407585159412 samples/sec learning_rate: 0.000074
[04/08/2021 14:16:17 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 320 speed: 56.924839448198874 samples/sec learning_rate: 0.000073
[04/08/2021 14:16:23 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 340 speed: 56.99348865333937 samples/sec learning_rate: 0.000073
[04/08/2021 14:16:25 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 2, train loss: 0.40636187671584423 .
[04/08/2021 14:16:25 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 2, train throughput: 56.355349321004795 samples/sec.
[04/08/2021 14:16:29 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 20 speed: 91.71774346751147 samples/sec
[04/08/2021 14:16:33 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 40 speed: 92.70447754596277 samples/sec
[04/08/2021 14:16:37 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 60 speed: 93.21737197847533 samples/sec
[04/08/2021 14:16:41 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 80 speed: 82.28774344329334 samples/sec
[04/08/2021 14:16:44 INFO 140384837293888] #progress_notice. epoch: 2, iterations: 100 speed: 89.45773113970893 samples/sec
[04/08/2021 14:16:47 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 2, validation pixel_accuracy: 0.9120192387530194 .
[04/08/2021 14:16:47 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 2, validation mIOU: 0.5847800261647766 .
[04/08/2021 14:16:47 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 2, validation throughput: 86.43337199856738 samples/sec.
[04/08/2021 14:16:47 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:16:47 INFO 140384837293888] #progress_metric: host=algo-1, completed 30.0 % of epochs
#metrics {"StartTime": 1617891283.7742198, "EndTime": 1617891407.656583, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 2, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 3.0, "count": 1, "min": 3, "max": 3}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:16:54 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 20 speed: 56.05442658717066 samples/sec learning_rate: 0.000072
[04/08/2021 14:17:00 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 40 speed: 57.00878804791504 samples/sec learning_rate: 0.000071
[04/08/2021 14:17:06 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 60 speed: 56.089001739281606 samples/sec learning_rate: 0.000071
[04/08/2021 14:17:12 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 80 speed: 55.82473183128351 samples/sec learning_rate: 0.000070
[04/08/2021 14:17:17 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 100 speed: 56.68639081244267 samples/sec learning_rate: 0.000070
[04/08/2021 14:17:23 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 120 speed: 56.965767331943475 samples/sec learning_rate: 0.000069
[04/08/2021 14:17:29 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 140 speed: 56.16702251154787 samples/sec learning_rate: 0.000069
[04/08/2021 14:17:35 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 160 speed: 56.337618400407656 samples/sec learning_rate: 0.000068
[04/08/2021 14:17:40 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 180 speed: 56.34594358593301 samples/sec learning_rate: 0.000068
[04/08/2021 14:17:46 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 200 speed: 56.360802586039654 samples/sec learning_rate: 0.000067
[04/08/2021 14:17:52 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 220 speed: 56.8610764896524 samples/sec learning_rate: 0.000067
[04/08/2021 14:17:58 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 240 speed: 56.60166965522244 samples/sec learning_rate: 0.000066
[04/08/2021 14:18:03 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 260 speed: 56.03589163371896 samples/sec learning_rate: 0.000065
[04/08/2021 14:18:09 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 280 speed: 56.28842708466137 samples/sec learning_rate: 0.000065
[04/08/2021 14:18:15 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 300 speed: 56.12535627784989 samples/sec learning_rate: 0.000064
[04/08/2021 14:18:21 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 320 speed: 56.82905605094463 samples/sec learning_rate: 0.000064
[04/08/2021 14:18:26 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 340 speed: 57.23111665248159 samples/sec learning_rate: 0.000063
[04/08/2021 14:18:28 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 3, train loss: 0.38977852644082106 .
[04/08/2021 14:18:28 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 3, train throughput: 56.27982671746528 samples/sec.
[04/08/2021 14:18:33 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 20 speed: 83.71634511028888 samples/sec
[04/08/2021 14:18:37 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 40 speed: 92.26615338081227 samples/sec
[04/08/2021 14:18:41 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 60 speed: 86.54082807730647 samples/sec
[04/08/2021 14:18:44 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 80 speed: 86.55131970104402 samples/sec
[04/08/2021 14:18:48 INFO 140384837293888] #progress_notice. epoch: 3, iterations: 100 speed: 94.12050882876818 samples/sec
[04/08/2021 14:18:51 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 3, validation pixel_accuracy: 0.9188516002415459 .
[04/08/2021 14:18:51 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 3, validation mIOU: 0.5873066990168427 .
[04/08/2021 14:18:51 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 3, validation throughput: 87.3669927616161 samples/sec.
[04/08/2021 14:18:51 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:18:51 INFO 140384837293888] #progress_metric: host=algo-1, completed 40.0 % of epochs
#metrics {"StartTime": 1617891407.6567984, "EndTime": 1617891531.410196, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 3, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 4.0, "count": 1, "min": 4, "max": 4}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:18:58 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 20 speed: 56.649114795115146 samples/sec learning_rate: 0.000063
[04/08/2021 14:19:04 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 40 speed: 56.8898051845114 samples/sec learning_rate: 0.000062
[04/08/2021 14:19:10 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 60 speed: 56.505685186183115 samples/sec learning_rate: 0.000061
[04/08/2021 14:19:15 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 80 speed: 56.57895473438383 samples/sec learning_rate: 0.000061
[04/08/2021 14:19:21 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 100 speed: 56.40931446098395 samples/sec learning_rate: 0.000060
[04/08/2021 14:19:27 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 120 speed: 55.962294234393504 samples/sec learning_rate: 0.000060
[04/08/2021 14:19:33 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 140 speed: 56.6350593070506 samples/sec learning_rate: 0.000059
[04/08/2021 14:19:38 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 160 speed: 56.64385512229996 samples/sec learning_rate: 0.000059
[04/08/2021 14:19:44 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 180 speed: 56.463420154357834 samples/sec learning_rate: 0.000058
[04/08/2021 14:19:50 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 200 speed: 56.37552734818001 samples/sec learning_rate: 0.000058
[04/08/2021 14:19:56 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 220 speed: 56.21995771075092 samples/sec learning_rate: 0.000057
[04/08/2021 14:20:02 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 240 speed: 55.7649373786795 samples/sec learning_rate: 0.000056
[04/08/2021 14:20:07 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 260 speed: 56.45980987924581 samples/sec learning_rate: 0.000056
[04/08/2021 14:20:13 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 280 speed: 56.68246471116877 samples/sec learning_rate: 0.000055
[04/08/2021 14:20:19 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 300 speed: 56.67523636194731 samples/sec learning_rate: 0.000055
[04/08/2021 14:20:25 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 320 speed: 57.21803929192011 samples/sec learning_rate: 0.000054
[04/08/2021 14:20:30 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 340 speed: 56.92522574105412 samples/sec learning_rate: 0.000054
[04/08/2021 14:20:32 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 4, train loss: 0.3702078983618345 .
[04/08/2021 14:20:32 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 4, train throughput: 56.27458040282141 samples/sec.
[04/08/2021 14:20:37 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 20 speed: 81.860443329804 samples/sec
[04/08/2021 14:20:40 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 40 speed: 88.40023816172868 samples/sec
[04/08/2021 14:20:44 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 60 speed: 86.66387381547037 samples/sec
[04/08/2021 14:20:48 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 80 speed: 93.22540063568269 samples/sec
[04/08/2021 14:20:51 INFO 140384837293888] #progress_notice. epoch: 4, iterations: 100 speed: 94.1116653601224 samples/sec
[04/08/2021 14:20:54 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 4, validation pixel_accuracy: 0.9132687198067633 .
[04/08/2021 14:20:54 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 4, validation mIOU: 0.5925078958620056 .
[04/08/2021 14:20:54 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 4, validation throughput: 87.73625988248293 samples/sec.
[04/08/2021 14:20:54 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:20:54 INFO 140384837293888] #progress_metric: host=algo-1, completed 50.0 % of epochs
#metrics {"StartTime": 1617891531.4103732, "EndTime": 1617891654.8837588, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 4, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 5.0, "count": 1, "min": 5, "max": 5}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:21:02 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 20 speed: 55.36157968374647 samples/sec learning_rate: 0.000053
[04/08/2021 14:21:07 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 40 speed: 56.626266223054394 samples/sec learning_rate: 0.000052
[04/08/2021 14:21:13 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 60 speed: 56.61513541968968 samples/sec learning_rate: 0.000052
[04/08/2021 14:21:19 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 80 speed: 56.34007785825164 samples/sec learning_rate: 0.000051
[04/08/2021 14:21:25 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 100 speed: 56.917645701801106 samples/sec learning_rate: 0.000051
[04/08/2021 14:21:30 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 120 speed: 55.691171418186286 samples/sec learning_rate: 0.000050
[04/08/2021 14:21:36 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 140 speed: 56.795821869173594 samples/sec learning_rate: 0.000050
[04/08/2021 14:21:42 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 160 speed: 56.62492838441138 samples/sec learning_rate: 0.000049
[04/08/2021 14:21:48 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 180 speed: 56.71940113575952 samples/sec learning_rate: 0.000048
[04/08/2021 14:21:53 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 200 speed: 56.58205548693346 samples/sec learning_rate: 0.000048
[04/08/2021 14:21:59 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 220 speed: 56.4682187345385 samples/sec learning_rate: 0.000047
[04/08/2021 14:22:05 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 240 speed: 55.74937196783412 samples/sec learning_rate: 0.000047
[04/08/2021 14:22:11 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 260 speed: 56.72208581230766 samples/sec learning_rate: 0.000046
[04/08/2021 14:22:16 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 280 speed: 56.511109576652125 samples/sec learning_rate: 0.000046
[04/08/2021 14:22:22 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 300 speed: 56.95924005530512 samples/sec learning_rate: 0.000045
[04/08/2021 14:22:28 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 320 speed: 57.042853839981504 samples/sec learning_rate: 0.000044
[04/08/2021 14:22:34 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 340 speed: 56.92846114954853 samples/sec learning_rate: 0.000044
[04/08/2021 14:22:36 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 5, train loss: 0.353750081115566 .
[04/08/2021 14:22:36 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 5, train throughput: 56.28865661797028 samples/sec.
[04/08/2021 14:22:40 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 20 speed: 91.83559904208005 samples/sec
[04/08/2021 14:22:44 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 40 speed: 92.12669797553954 samples/sec
[04/08/2021 14:22:48 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 60 speed: 90.22264168192586 samples/sec
[04/08/2021 14:22:52 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 80 speed: 89.68416100924787 samples/sec
[04/08/2021 14:22:55 INFO 140384837293888] #progress_notice. epoch: 5, iterations: 100 speed: 85.58232161439736 samples/sec
[04/08/2021 14:22:58 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 5, validation pixel_accuracy: 0.9186298686594203 .
[04/08/2021 14:22:58 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 5, validation mIOU: 0.5918629009661484 .
[04/08/2021 14:22:58 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 5, validation throughput: 87.88259747988985 samples/sec.
[04/08/2021 14:22:58 INFO 140384837293888] #progress_metric: host=algo-1, completed 60.0 % of epochs
#metrics {"StartTime": 1617891654.8839195, "EndTime": 1617891778.3679767, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 5, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 6.0, "count": 1, "min": 6, "max": 6}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:23:05 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 20 speed: 55.76498371727979 samples/sec learning_rate: 0.000043
[04/08/2021 14:23:11 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 40 speed: 56.620533073920114 samples/sec learning_rate: 0.000043
[04/08/2021 14:23:17 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 60 speed: 56.813275788211826 samples/sec learning_rate: 0.000042
[04/08/2021 14:23:22 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 80 speed: 56.32258254029134 samples/sec learning_rate: 0.000041
[04/08/2021 14:23:28 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 100 speed: 56.623208399040486 samples/sec learning_rate: 0.000041
[04/08/2021 14:23:34 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 120 speed: 56.4455633002583 samples/sec learning_rate: 0.000040
[04/08/2021 14:23:40 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 140 speed: 56.894483574841544 samples/sec learning_rate: 0.000040
[04/08/2021 14:23:45 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 160 speed: 56.29059895016382 samples/sec learning_rate: 0.000039
[04/08/2021 14:23:51 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 180 speed: 56.78717101004349 samples/sec learning_rate: 0.000039
[04/08/2021 14:23:57 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 200 speed: 56.45316058087761 samples/sec learning_rate: 0.000038
[04/08/2021 14:24:03 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 220 speed: 56.50330639607038 samples/sec learning_rate: 0.000037
[04/08/2021 14:24:09 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 240 speed: 56.77146202313195 samples/sec learning_rate: 0.000037
[04/08/2021 14:24:14 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 260 speed: 56.60434319800131 samples/sec learning_rate: 0.000036
[04/08/2021 14:24:20 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 280 speed: 56.72774364137248 samples/sec learning_rate: 0.000036
[04/08/2021 14:24:26 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 300 speed: 56.394050751303155 samples/sec learning_rate: 0.000035
[04/08/2021 14:24:32 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 320 speed: 57.07652346972622 samples/sec learning_rate: 0.000034
[04/08/2021 14:24:37 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 340 speed: 56.87543720925108 samples/sec learning_rate: 0.000034
[04/08/2021 14:24:39 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 6, train loss: 0.3403322241921246 .
[04/08/2021 14:24:39 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 6, train throughput: 56.19899449722635 samples/sec.
[04/08/2021 14:24:44 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 20 speed: 92.31019280790875 samples/sec
[04/08/2021 14:24:48 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 40 speed: 90.6575165553073 samples/sec
[04/08/2021 14:24:52 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 60 speed: 90.23550102997413 samples/sec
[04/08/2021 14:24:55 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 80 speed: 92.62271786236785 samples/sec
[04/08/2021 14:24:59 INFO 140384837293888] #progress_notice. epoch: 6, iterations: 100 speed: 90.10283793164099 samples/sec
[04/08/2021 14:25:02 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 6, validation pixel_accuracy: 0.923598892285628 .
[04/08/2021 14:25:02 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 6, validation mIOU: 0.6031820774781124 .
[04/08/2021 14:25:02 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 6, validation throughput: 86.55294171374103 samples/sec.
[04/08/2021 14:25:02 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:25:02 INFO 140384837293888] #progress_metric: host=algo-1, completed 70.0 % of epochs
#metrics {"StartTime": 1617891778.3681593, "EndTime": 1617891902.3668137, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 6, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 7.0, "count": 1, "min": 7, "max": 7}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:25:09 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 20 speed: 56.43886989152721 samples/sec learning_rate: 0.000033
[04/08/2021 14:25:15 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 40 speed: 56.722613189350746 samples/sec learning_rate: 0.000032
[04/08/2021 14:25:21 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 60 speed: 56.17275822035395 samples/sec learning_rate: 0.000032
[04/08/2021 14:25:26 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 80 speed: 57.11674618832249 samples/sec learning_rate: 0.000031
[04/08/2021 14:25:32 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 100 speed: 56.65523637140327 samples/sec learning_rate: 0.000031
[04/08/2021 14:25:38 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 120 speed: 56.83305061707798 samples/sec learning_rate: 0.000030
[04/08/2021 14:25:44 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 140 speed: 56.83396511482163 samples/sec learning_rate: 0.000030
[04/08/2021 14:25:49 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 160 speed: 56.48223906085221 samples/sec learning_rate: 0.000029
[04/08/2021 14:25:55 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 180 speed: 56.971328784718416 samples/sec learning_rate: 0.000028
[04/08/2021 14:26:01 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 200 speed: 56.49312743704921 samples/sec learning_rate: 0.000028
[04/08/2021 14:26:07 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 220 speed: 55.866045755823535 samples/sec learning_rate: 0.000027
[04/08/2021 14:26:12 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 240 speed: 56.66887119710056 samples/sec learning_rate: 0.000027
[04/08/2021 14:26:18 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 260 speed: 55.93183718067431 samples/sec learning_rate: 0.000026
[04/08/2021 14:26:24 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 280 speed: 56.346842474701845 samples/sec learning_rate: 0.000025
[04/08/2021 14:26:30 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 300 speed: 56.74246887608946 samples/sec learning_rate: 0.000025
[04/08/2021 14:26:35 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 320 speed: 56.981535666069476 samples/sec learning_rate: 0.000024
[04/08/2021 14:26:41 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 340 speed: 57.14558789593213 samples/sec learning_rate: 0.000023
[04/08/2021 14:26:43 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 7, train loss: 0.32668686866416713 .
[04/08/2021 14:26:43 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 7, train throughput: 56.31134651959459 samples/sec.
[04/08/2021 14:26:48 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 20 speed: 91.44391998691884 samples/sec
[04/08/2021 14:26:52 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 40 speed: 91.07262048225472 samples/sec
[04/08/2021 14:26:55 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 60 speed: 88.24853606994778 samples/sec
[04/08/2021 14:26:59 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 80 speed: 83.17246029070574 samples/sec
[04/08/2021 14:27:03 INFO 140384837293888] #progress_notice. epoch: 7, iterations: 100 speed: 89.48397368371927 samples/sec
[04/08/2021 14:27:06 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 7, validation pixel_accuracy: 0.9227172103238745 .
[04/08/2021 14:27:06 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 7, validation mIOU: 0.6029484334576757 .
[04/08/2021 14:27:06 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 7, validation throughput: 86.33042265589704 samples/sec.
[04/08/2021 14:27:06 INFO 140384837293888] #progress_metric: host=algo-1, completed 80.0 % of epochs
#metrics {"StartTime": 1617891902.3670013, "EndTime": 1617892026.3234072, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 7, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 8.0, "count": 1, "min": 8, "max": 8}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:27:13 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 20 speed: 56.477913404116364 samples/sec learning_rate: 0.000023
[04/08/2021 14:27:19 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 40 speed: 56.10883844126657 samples/sec learning_rate: 0.000022
[04/08/2021 14:27:25 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 60 speed: 56.608974753707564 samples/sec learning_rate: 0.000021
[04/08/2021 14:27:30 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 80 speed: 55.95249585205813 samples/sec learning_rate: 0.000021
[04/08/2021 14:27:36 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 100 speed: 56.82506204629573 samples/sec learning_rate: 0.000020
[04/08/2021 14:27:42 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 120 speed: 56.674087655041426 samples/sec learning_rate: 0.000020
[04/08/2021 14:27:48 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 140 speed: 57.03495160318435 samples/sec learning_rate: 0.000019
[04/08/2021 14:27:53 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 160 speed: 56.17727238862506 samples/sec learning_rate: 0.000018
[04/08/2021 14:27:59 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 180 speed: 56.932518002212525 samples/sec learning_rate: 0.000018
[04/08/2021 14:28:05 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 200 speed: 55.56487273516113 samples/sec learning_rate: 0.000017
[04/08/2021 14:28:11 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 220 speed: 56.64591105994051 samples/sec learning_rate: 0.000016
[04/08/2021 14:28:17 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 240 speed: 56.657388802399055 samples/sec learning_rate: 0.000016
[04/08/2021 14:28:22 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 260 speed: 56.74059781979165 samples/sec learning_rate: 0.000015
[04/08/2021 14:28:28 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 280 speed: 56.59646651930478 samples/sec learning_rate: 0.000015
[04/08/2021 14:28:34 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 300 speed: 56.043847056416425 samples/sec learning_rate: 0.000014
[04/08/2021 14:28:40 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 320 speed: 56.53491306904436 samples/sec learning_rate: 0.000013
[04/08/2021 14:28:45 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 340 speed: 56.89795669204551 samples/sec learning_rate: 0.000013
[04/08/2021 14:28:47 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 8, train loss: 0.32176074157388224 .
[04/08/2021 14:28:47 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 8, train throughput: 56.46027047391172 samples/sec.
[04/08/2021 14:28:52 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 20 speed: 87.08673523679013 samples/sec
[04/08/2021 14:28:56 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 40 speed: 91.93095814748655 samples/sec
[04/08/2021 14:28:59 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 60 speed: 87.75073192227175 samples/sec
[04/08/2021 14:29:03 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 80 speed: 85.05785832069041 samples/sec
[04/08/2021 14:29:07 INFO 140384837293888] #progress_notice. epoch: 8, iterations: 100 speed: 94.26143629274213 samples/sec
[04/08/2021 14:29:09 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 8, validation pixel_accuracy: 0.9255205974486715 .
[04/08/2021 14:29:09 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 8, validation mIOU: 0.6051743316208571 .
[04/08/2021 14:29:09 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 8, validation throughput: 86.66288214431202 samples/sec.
[04/08/2021 14:29:09 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:29:10 INFO 140384837293888] #progress_metric: host=algo-1, completed 90.0 % of epochs
#metrics {"StartTime": 1617892026.3236125, "EndTime": 1617892150.1251485, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 8, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 9.0, "count": 1, "min": 9, "max": 9}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:29:17 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 20 speed: 56.06205943631239 samples/sec learning_rate: 0.000012
[04/08/2021 14:29:23 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 40 speed: 56.75916992009094 samples/sec learning_rate: 0.000011
[04/08/2021 14:29:28 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 60 speed: 56.35313549921359 samples/sec learning_rate: 0.000010
[04/08/2021 14:29:34 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 80 speed: 56.78707490418976 samples/sec learning_rate: 0.000010
[04/08/2021 14:29:40 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 100 speed: 56.70190902193254 samples/sec learning_rate: 0.000009
[04/08/2021 14:29:46 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 120 speed: 56.1782129316717 samples/sec learning_rate: 0.000008
[04/08/2021 14:29:51 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 140 speed: 56.546059993259185 samples/sec learning_rate: 0.000008
[04/08/2021 14:29:57 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 160 speed: 56.675427817625504 samples/sec learning_rate: 0.000007
[04/08/2021 14:30:03 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 180 speed: 56.76310665487007 samples/sec learning_rate: 0.000006
[04/08/2021 14:30:09 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 200 speed: 56.48409311658427 samples/sec learning_rate: 0.000006
[04/08/2021 14:30:14 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 220 speed: 57.04256292128146 samples/sec learning_rate: 0.000005
[04/08/2021 14:30:20 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 240 speed: 56.29503763132376 samples/sec learning_rate: 0.000004
[04/08/2021 14:30:26 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 260 speed: 56.48266690906397 samples/sec learning_rate: 0.000003
[04/08/2021 14:30:32 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 280 speed: 56.75316985054154 samples/sec learning_rate: 0.000003
[04/08/2021 14:30:37 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 300 speed: 57.008013197608186 samples/sec learning_rate: 0.000002
[04/08/2021 14:30:43 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 320 speed: 56.5426773423565 samples/sec learning_rate: 0.000001
[04/08/2021 14:30:49 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 340 speed: 57.36153335931784 samples/sec learning_rate: 0.000000
[04/08/2021 14:30:51 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 9, train loss: 0.3187850824951782 .
[04/08/2021 14:30:51 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 9, train throughput: 56.41648647817144 samples/sec.
[04/08/2021 14:30:56 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 20 speed: 87.02078627353855 samples/sec
[04/08/2021 14:30:59 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 40 speed: 86.92835510788832 samples/sec
[04/08/2021 14:31:03 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 60 speed: 87.20317996177081 samples/sec
[04/08/2021 14:31:07 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 80 speed: 89.80645851984782 samples/sec
[04/08/2021 14:31:11 INFO 140384837293888] #progress_notice. epoch: 9, iterations: 100 speed: 91.3648652512202 samples/sec
[04/08/2021 14:31:13 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 9, validation pixel_accuracy: 0.9259853845863527 .
[04/08/2021 14:31:13 INFO 140384837293888] #quality_metric. host: algo-1, epoch: 9, validation mIOU: 0.6064410985692649 .
[04/08/2021 14:31:13 INFO 140384837293888] #throughput_metric. host: algo-1, epoch: 9, validation throughput: 86.15235516003033 samples/sec.
[04/08/2021 14:31:13 INFO 140384837293888] Serializing model to /opt/ml/model/model_best.params
[04/08/2021 14:31:14 INFO 140384837293888] #progress_metric: host=algo-1, completed 100.0 % of epochs
#metrics {"StartTime": 1617892150.125332, "EndTime": 1617892274.1432915, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training", "epoch": 9, "Meta": "training_data_iter"}, "Metrics": {"Total Records Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Total Batches Seen": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Records Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Max Batches Seen Between Resets": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Reset Count": {"sum": 10.0, "count": 1, "min": 10, "max": 10}, "Number of Records Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}, "Number of Batches Since Last Reset": {"sum": 0.0, "count": 1, "min": 0, "max": 0}}}

[04/08/2021 14:31:14 WARNING 140384837293888] wait_for_all_workers will not sync workers since the kv store is not running distributed
[04/08/2021 14:31:14 INFO 140384837293888] Serializing model to /opt/ml/model/model_algo-1
[04/08/2021 14:31:14 INFO 140384837293888] Test data is not provided.
#metrics {"StartTime": 1617891021.4946072, "EndTime": 1617892274.53571, "Dimensions": {"Algorithm": "AWS/Semantic Segmentation", "Host": "algo-1", "Operation": "training"}, "Metrics": {"epochs": {"sum": 10.0, "count": 1, "min": 10, "max": 10}, "setuptime": {"sum": 18.151521682739258, "count": 1, "min": 18.151521682739258, "max": 18.151521682739258}, "totaltime": {"sum": 1254471.3032245636, "count": 1, "min": 1254471.3032245636, "max": 1254471.3032245636}}}


2021-04-08 14:31:29 Uploading - Uploading generated training model
2021-04-08 14:32:10 Completed - Training job completed
Training seconds: 1460
Billable seconds: 1460
CPU times: user 3.87 s, sys: 133 ms, total: 4 s
Wall time: 27min 3s

10. Deploy Model

%%time
# Host the trained model behind an endpoint served by a single ml.m4.xlarge instance.
model_deployed = model.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)
print('\nmodel deployed ...')
-----------------!
model deployed ...
CPU times: user 314 ms, sys: 5.7 ms, total: 320 ms
Wall time: 8min 32s

11. Predict with Validation Data

# Folder holding the validation images; keep only the entries whose name ends in 'jpg'.
x_fold = 'validation'
xs = [fname for fname in os.listdir(x_fold) if fname.endswith('jpg')]
print(len(xs))
1842
# Folder holding the validation label maps; keep only the entries whose name ends in 'png'.
y_fold = 'validation_annotation'
ys = [fname for fname in os.listdir(y_fold) if fname.endswith('png')]
print(len(ys))
1842
# Responses: request a PNG from the endpoint and hand it back as raw bytes.
model_deployed.deserializer = sagemaker.deserializers.BytesDeserializer(
    accept='image/png',
)
# Requests: pass the payload through unchanged, labelled as a JPEG image.
model_deployed.serializer = sagemaker.serializers.IdentitySerializer(
    content_type='image/jpeg',
)
# Plot `num_examples` randomly chosen validation examples, one per row:
# input image | label map | segmentation predicted by the deployed endpoint.
num_examples = 5
plt.figure(figsize=(15, 15))
# random.sample draws without replacement, so no image is shown twice; the
# min() keeps it from failing when fewer than num_examples images exist.
for i, x_name in enumerate(random.sample(xs, min(num_examples, len(xs)))):
    # The label map shares the image's base name. splitext strips only the
    # final extension, so names containing extra dots still map correctly.
    y_name = os.path.splitext(x_name)[0] + '.png'
    x_path = os.path.join(x_fold, x_name)

    # Column 1: input image (x)
    plt.subplot(num_examples, 3, 1 + i*3)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(plt.imread(x_path))

    # Column 2: label map (y)
    plt.subplot(num_examples, 3, 2 + i*3)
    plt.xticks([])
    plt.yticks([])
    plt.imshow(plt.imread(os.path.join(y_fold, y_name)))

    # Column 3: predicted segmentation (yh)
    plt.subplot(num_examples, 3, 3 + i*3)
    plt.xticks([])
    plt.yticks([])
    # Send the raw JPEG bytes to the endpoint; the response bytes are decoded
    # as an image and converted to an array for display.
    with open(x_path, 'rb') as f:
        payload = f.read()
    results = model_deployed.predict(payload)
    yh = np.array(Image.open(io.BytesIO(results)))
    plt.imshow(yh)

In the figure above, each row shows one randomly chosen validation example:

Column 1: Input image (x)

Column 2: Label map (y)

Column 3: Predicted segmentation (yh)

12. Cleanup

# Delete the deployed endpoint once you are done with it;
# otherwise it keeps running and you continue to accrue costs!
model_deployed.delete_endpoint()