Source code for pywick.optimizers.eve

# Source: https://github.com/moskomule/eve.pytorch

import torch
from torch.optim.optimizer import Optimizer


class Eve(Optimizer):
    """
    Implementation of `Eve: A Gradient Based Optimization Method with Locally
    and Globally Adaptive Learning Rates <https://arxiv.org/pdf/1611.01505.pdf>`_
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999, 0.999), eps=1e-8, k=0.1, K=10, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, k=k, K=K, weight_decay=weight_decay)
        super(Eve, self).__init__(params, defaults)

    def step(self, closure):
        """
        :param closure: a closure that re-evaluates the model and returns the loss;
            see http://pytorch.org/docs/optim.html#optimizer-step-closure
        :return: loss
        """
        loss = closure()
        _loss = loss.item()  # float

        for group in self.param_groups:
            for p in group['params']:
                grad = p.grad.data
                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['m_t'] = torch.zeros_like(grad)
                    # Exponential moving average of squared gradient values
                    state['v_t'] = torch.zeros_like(grad)
                    # smoothly tracked objective values, \hat{f}_0 = f_0
                    state['ft_2'], state['ft_1'] = _loss, None
                    state['d'] = 1

                m_t, v_t = state['m_t'], state['v_t']
                beta1, beta2, beta3 = group['betas']
                k, K = group['k'], group['K']
                d = state['d']
                state['step'] += 1
                t = state['step']

                # initialization of \hat{f}_1 = f_1
                if t == 1:
                    state['ft_1'] = _loss

                # \hat{f}_{t-1}, \hat{f}_{t-2}
                ft_1, ft_2 = state['ft_1'], state['ft_2']
                # f(\theta_{t-1})
                f = _loss

                if group['weight_decay'] != 0:
                    grad = grad.add(p.data, alpha=group['weight_decay'])

                # Decay the first and second moment running averages
                m_t.mul_(beta1).add_(grad, alpha=1 - beta1)
                v_t.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                # bias-corrected moment estimates
                m_t_hat = m_t / (1 - beta1 ** t)
                v_t_hat = v_t / (1 - beta2 ** t)

                if t > 1:
                    # relative change of the objective, clipped to [delta, Delta]
                    if f >= state['ft_2']:
                        delta = k + 1
                        Delta = K + 1
                    else:
                        delta = 1 / (K + 1)
                        Delta = 1 / (k + 1)

                    c = min(max(delta, f / ft_2), Delta)
                    r = abs(c - 1) / min(c, 1)
                    state['ft_1'], state['ft_2'] = c * ft_2, ft_1
                    # d_t = beta3 * d_{t-1} + (1 - beta3) * r_t
                    state['d'] = beta3 * d + (1 - beta3) * r

                # update parameters with the adapted step size lr / d_t
                p.data.addcdiv_(m_t_hat, v_t_hat.sqrt().add_(group['eps']),
                                value=-group['lr'] / state['d'])

        return loss
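

# Below is a minimal usage sketch; the model, data, and training loop are
# hypothetical and not part of pywick. Unlike most PyTorch optimizers, Eve
# requires a closure on every step() call, because it reads the returned
# loss value to adapt its learning rate.
if __name__ == '__main__':
    import torch.nn as nn

    # hypothetical toy model and random data
    model = nn.Linear(10, 1)
    inputs = torch.randn(32, 10)
    targets = torch.randn(32, 1)
    criterion = nn.MSELoss()

    optimizer = Eve(model.parameters(), lr=1e-3)

    for _ in range(100):
        def closure():
            # Eve adapts its step size from the returned loss,
            # so the closure must return it.
            optimizer.zero_grad()
            loss = criterion(model(inputs), targets)
            loss.backward()
            return loss

        optimizer.step(closure)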