# Source code for pywick.callbacks.ModelCheckpoint

import json
import math
import os
import shutil

import torch

from . import Callback

# Public API of this module: a star-import exports only the callback class.
__all__ = ['ModelCheckpoint']

class ModelCheckpoint(Callback):
    """
    Model Checkpoint to save model weights during training. 'Best' is determined by minimizing (or maximizing)
    the value found under ``monitored_log_key`` in the logs.

    Saved checkpoints contain these keys by default:
        'run_id'
        'epoch'         (1-based)
        'metric_type'   (the monitored_log_key)
        'metric_value'
        'best_epoch'    (1-based)
        'state_dict'
        - plus everything in addl_k_v, everything produced by custom_func and the requested epoch_log_keys

    Additionally saves a .json file with statistics about the run such as:
        'run_id'
        'num_epochs'
        'best_epoch'
        'best_loss_or_gain'
        'metric_type'
        - plus everything in addl_k_v and everything produced by custom_func

    :param run_id: (string):
        Uniquely identifies the run
    :param monitored_log_key: (string):
        Name of the key in the logs that will contain the value we want to minimize or maximize
        (and thus that will dictate whether the model is 'best'), e.g. 'dice_coeff'
    :param save_dir: (string):
        Path indicating where to save the checkpoint ('~' is expanded)
    :param addl_k_v: (dict):
        dictionary of additional key/value pairs to save with the model. Typically these include some
        initialization parameters, name of the model etc. ('model_name' and 'optimizer' are used to build
        the checkpoint file name)
    :param epoch_log_keys: (list):
        list of keys to copy from the epoch log dictionary into the checkpoint. Keys missing from the logs
        are stored as None.
    :param save_interval: (int):
        How often (in epochs) to save the model. ``None`` defaults to every 5 epochs.
    :param save_best_only: (bool):
        Whether to save only when the monitored value improved.
        Default: False
    :param max_saves: (integer > 0 or -1):
        the max number of per-epoch checkpoint files to keep. The oldest file is removed when the limit is
        reached. Set equal to -1 to have no limit.
        Default: 5
    :param custom_func: func(k_v_dict, logs, out_dict, is_end_training):
        Custom function for performing any additional logic (to add values to the model). It is called with
        four positional arguments: the addl_k_v dictionary, the epoch logs dictionary, an output dictionary
        that it is expected to modify (the dictionary is preserved across epochs) and a bool that is True
        only for the call made at the end of training.
        NOTE(review): earlier documentation listed an extra ``monitored_log_key`` argument; the callback
        has only ever passed the four arguments above.
    :param do_minimize: (bool):
        whether to minimize or maximize the 'monitored_log_key' value
    :param verbose: (bool):
        verbosity of the console output
        Default: False
    """

    def __init__(self, run_id, monitored_log_key, save_dir, addl_k_v=None, epoch_log_keys=None,
                 save_interval=5, save_best_only=False, max_saves=5, custom_func=None,
                 do_minimize=True, verbose=False, **kwargs):
        super().__init__(**kwargs)

        self.run_id = run_id
        self.addl_k_v = {} if addl_k_v is None else addl_k_v
        self.save_dir = os.path.expanduser(save_dir)
        # The documented contract is that None means "use the default of 5".
        self.save_interval = 5 if save_interval is None else save_interval
        self.epoch_log_keys = [] if epoch_log_keys is None else epoch_log_keys
        self.save_best_only = save_best_only
        self.max_saves = max_saves
        self.custom_func = custom_func
        self.custom_func_dict = {}  # this is expected to be filled by the custom_func
        self.verbose = verbose
        self.monitored_log_key = monitored_log_key
        self.do_minimize = do_minimize

        # Epochs are tracked 0-based internally; -1 means "none yet".
        self.last_saved_ep = -1
        self.last_epoch_logs = None
        self.last_epoch = -1
        self.best_epoch = -1

        # Paths of per-epoch checkpoints, oldest first (only pruned when max_saves > 0).
        self.old_files = []

        # Start from the worst possible value so the first observed metric always counts as an improvement.
        self.best_loss = math.inf if do_minimize else -math.inf

    def _is_improvement(self, value):
        """Return True if ``value`` beats the best monitored value seen so far."""
        if self.do_minimize:
            return value < self.best_loss
        return value > self.best_loss

    def _model_state_dict(self):
        """Return the model weights, unwrapping a DataParallel container if present."""
        model = self.trainer.model
        if isinstance(model, torch.nn.DataParallel):
            model = model.module
        return model.state_dict()

    def _build_save_dict(self, epoch, metric_value, logs):
        """Assemble the checkpoint dictionary for the (0-based) ``epoch``."""
        save_dict = {
            'run_id': self.run_id,
            'epoch': epoch + 1,
            'metric_type': self.monitored_log_key,
            'metric_value': metric_value,
            'best_epoch': self.best_epoch + 1,
            'state_dict': self._model_state_dict(),
        }
        # Later updates intentionally win over earlier ones.
        save_dict.update(self.addl_k_v)
        save_dict.update(self.custom_func_dict)
        for key in self.epoch_log_keys:
            save_dict[key] = logs.get(key)  # not guaranteed to be present, so may be None
        return save_dict

    def on_epoch_end(self, epoch, logs=None):
        """
        Save a checkpoint at the end of ``epoch`` (0-based) if the save interval and policy allow it.

        :param epoch: (int): index of the epoch that just finished
        :param logs: (dict): epoch logs; must contain ``monitored_log_key`` for a save to happen
        """
        self.last_epoch_logs = logs
        self.last_epoch = epoch

        if (epoch + 1) % self.save_interval != 0:  # only save with given frequency
            return

        epoch_logs = {} if logs is None else logs
        current_loss = epoch_logs.get(self.monitored_log_key)
        if current_loss is None:
            # Must be checked before any comparison: None cannot be ordered against a float.
            if self.verbose:
                print(f'ModelCheckpoint could not find monitored_log_key (loss variable) in logs: {self.monitored_log_key}')
            return

        improved = self._is_improvement(current_loss)
        if self.save_best_only and not improved:
            return

        # Call custom function (if set) to process things like best-N results etc
        if self.custom_func is not None:
            self.custom_func(self.addl_k_v, epoch_logs, self.custom_func_dict, False)

        checkpt_name = generate_checkpoint_name(self.run_id, self.addl_k_v, epoch, False)
        checkpt_path = os.path.join(self.save_dir, checkpt_name)

        if self.verbose:
            print('\nEpoch %i: loss metric changed from %0.4f to %0.4f saving model to %s' % (
                epoch + 1, self.best_loss, current_loss, checkpt_path))

        if improved:
            self.best_loss = current_loss
            self.best_epoch = epoch

        save_checkpoint(self._build_save_dict(epoch, current_loss, epoch_logs),
                        is_best=(self.best_epoch == epoch),
                        save_path=self.save_dir,
                        filename=checkpt_name)
        self.last_saved_ep = epoch

        if self.max_saves > 0:
            if len(self.old_files) >= self.max_saves:
                oldest = self.old_files.pop(0)
                try:
                    os.remove(oldest)
                except OSError:
                    # Best effort: the file may already have been removed by the user.
                    pass
                else:
                    if self.verbose:
                        print(f'ModelCheckpoint removing old model snapshot: {oldest}')
            self.old_files.append(checkpt_path)

    def on_train_end(self, logs=None):
        """
        Save the final model if it is the best one and was not saved yet, then write the run statistics.

        The statistics are appended as JSON to ``<save_dir>/<run_id>_stats.json``.

        :param logs: (dict): unused; the logs of the last finished epoch are used instead
        """
        final_epoch = self.last_epoch
        last_logs = {} if self.last_epoch_logs is None else self.last_epoch_logs
        current_loss = last_logs.get(self.monitored_log_key)

        # Save model if it hasn't been previously saved and it has best loss value
        if (current_loss is not None
                and self.last_saved_ep < final_epoch
                and self._is_improvement(current_loss)):
            # Call custom function (if set) to process things like best-N results etc
            if self.custom_func is not None:
                self.custom_func(self.addl_k_v, last_logs, self.custom_func_dict, True)

            self.best_loss = current_loss
            self.best_epoch = final_epoch

            save_checkpoint(self._build_save_dict(final_epoch, current_loss, last_logs),
                            is_best=True,
                            save_path=self.save_dir,
                            filename=generate_checkpoint_name(self.run_id, self.addl_k_v, final_epoch, False))
            self.last_saved_ep = final_epoch

        stats = {
            'run_id': self.run_id,
            'num_epochs': final_epoch + 1,
            'best_epoch': self.best_epoch + 1,
            'best_loss_or_gain': self.best_loss,
            'metric_type': self.monitored_log_key,
        }
        stats.update(self.addl_k_v)
        stats.update(self.custom_func_dict)

        statsfile_path = generate_statsfile_name(self.run_id, self.save_dir)
        stats_dir = os.path.dirname(statsfile_path)
        if stats_dir:
            # The directory may not exist yet if no checkpoint was ever written.
            os.makedirs(stats_dir, exist_ok=True)
        with open(statsfile_path, 'a') as statsfile:
            json.dump(stats, statsfile)
def generate_statsfile_name(run_id, save_dir):
    """
    Build the path of the JSON statistics file for a run.

    :param run_id: identifier of the run (converted with ``str``)
    :param save_dir: (string): directory of the run; '~' is expanded
    :return: (string): ``<save_dir>/<run_id>_stats.json``
    """
    return os.path.join(os.path.expanduser(save_dir), f"{run_id}_stats.json")


def generate_checkpoint_name(run_id, kv_dict, epoch, is_best):
    """
    Build the file name of a checkpoint.

    :param run_id: identifier of the run (converted with ``str``)
    :param kv_dict: (dict): may contain 'model_name' (default 'model') and 'optimizer' (default 'o')
    :param epoch: (int): 0-based epoch index; the name contains ``epoch + 1``
    :param is_best: (bool): if True the epoch number is replaced by 'best'
    :return: (string): ``<run_id>_<model_name>_<optimizer>_ep_<N|best>.pth.tar``
    """
    model_name = kv_dict.get('model_name', 'model')
    optimizer_name = kv_dict.get('optimizer', 'o')
    # f-string formatting also accepts non-string values (e.g. an optimizer given as an object).
    suffix = "best" if is_best else str(epoch + 1)
    return f"{run_id}_{model_name}_{optimizer_name}_ep_{suffix}.pth.tar"


def save_checkpoint(state, is_best=False, save_path=".", filename=None):
    """
    Saves checkpoint to file.

    :param state: (dict): the dictionary to save. Can have other values besides just model weights.
    :param is_best: (bool): whether this is the best result we've seen thus far. If True and ``filename``
        contains '_ep_' (not at position 0), the checkpoint is also copied to ``<prefix>_best.pth.tar``.
    :param save_path: (string): local dir to save to ('~' is expanded; created if missing)
    :param filename: (string): name of the file to save under `save_path`

    :return: None. If ``filename`` is empty an error is printed and nothing is saved.
    """
    if not filename:
        print("ERROR: No filename defined. Checkpoint is NOT saved.")
        return

    target_dir = os.path.expanduser(save_path)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    checkpoint_path = os.path.join(target_dir, filename)
    torch.save(state, checkpoint_path)

    if is_best:
        pos = filename.find("_ep_")
        if pos > 0:
            best_path = os.path.join(target_dir, filename[:pos] + "_best.pth.tar")
            shutil.copyfile(checkpoint_path, best_path)