arch.py

import torch
from torch import optim, autograd, nn

from model_search import Network

use_DataParallel = torch.cuda.device_count() > 1
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')


def concat(xs):
    """
    flatten all tensor from [d1,d2,...dn] to [d]
    and then concat all [d_1] to [d_1+d_2+d_3+...]
    :param xs:
    :return:
    """
    return torch.cat([x.view(-1) for x in xs])


class Arch:

    def __init__(self, model, criterion, args):
        """

        :param model: network
        :param args:
        """
        self.momentum = args.momentum  # momentum for optimizer of theta
        self.wd = args.wd  # weight decay for optimizer of theta
        self.model = model  # main model with respect to theta and alpha
        self.criterion = criterion
        # this is the optimizer to optimize alpha parameter
        self.optimizer = optim.Adam(
            self.model.module.arch_and_attn_parameters() if use_DataParallel else self.model.arch_and_attn_parameters(),
            lr=args.arch_lr,
            betas=(0.5, 0.999),
            weight_decay=args.arch_wd)

    def comp_unrolled_model(self, x, target, eta, optimizer):
        """
        loss on train set and then update w_pi, not-in-place
        :param x:
        :param target:
        :param eta:
        :param optimizer: optimizer of theta, not optimizer of alpha
        :return:
        """
        # forward to get loss
        logits = self.model(x)
        loss = self.criterion(logits, target)
        # flatten current weights
        theta = concat(self.model.parameters()).detach()
        # theta: torch.Size([1930618])
        # print('theta:', theta.shape)
        try:
            # fetch momentum data from theta optimizer
            moment = concat(optimizer.state[v]['momentum_buffer'] for v in self.model.parameters())
            moment.mul_(self.momentum)
        except:
            moment = torch.zeros_like(theta)

        # flatten all gradients
        dtheta = concat(autograd.grad(loss, self.model.parameters())).data
        # indeed, here we implement a simple SGD with momentum and weight decay
        # theta = theta - eta * (moment + weight decay + dtheta)
        theta = theta.sub(eta, moment + dtheta + self.wd * theta)
        # construct a new model
        unrolled_model = self.construct_model_from_theta(theta)

        return unrolled_model

    def step(self, x_train, target_train, x_valid, target_valid, eta, optimizer, unrolled):
        """
        update alpha parameter by manually computing the gradients
        :param x_train:
        :param target_train:
        :param x_valid:
        :param target_valid:
        :param eta:
        :param optimizer: theta optimizer
        :param unrolled:
        :return:
        """
        # alpha optimizer
        self.optimizer.zero_grad()

        # compute the gradient and write it into tensor.grad
        # instead of generated by loss.backward()
        if unrolled:
            self.backward_step_unrolled(x_train, target_train, x_valid, target_valid, eta, optimizer)
        else:
            # directly optimize alpha on w, instead of w_pi
            self.backward_step(x_valid, target_valid)

        self.optimizer.step()

    def backward_step(self, x_valid, target_valid):
        """
        simply train on validate set and backward
        :param x_valid:
        :param target_valid:
        :return:
        """
        logits = self.model(x_valid)
        loss = self.criterion(logits, target_valid)
        # both alpha and theta require grad but only alpha optimizer will
        # step in current phase.
        loss.backward()
        # print('back')

    def backward_step_unrolled(self, x_train, target_train, x_valid, target_valid, eta, optimizer):
        """
        train on validate set based on update w_pi
        :param x_train:
        :param target_train:
        :param x_valid:
        :param target_valid:
        :param eta: 0.01, according to author's comments
        :param optimizer: theta optimizer
        :return:
        """

        # theta_pi = theta - lr * grad
        unrolled_model = self.comp_unrolled_model(x_train, target_train, eta, optimizer)
        # calculate loss on theta_pi
        unrolled_logits = unrolled_model(x_valid)
        unrolled_loss = self.criterion(unrolled_logits, target_valid)

        # this will update theta_pi model, but NOT theta model
        unrolled_loss.backward()
        # grad(L(w', a), a), part of Eq. 6
        # dalpha = [v.grad for v in unrolled_model.arch_and_attn_parameters()]
        dalpha = [v.grad for v in unrolled_model.module.arch_and_attn_parameters()] if use_DataParallel else [v.grad for
                                                                                                              v in
                                                                                                              unrolled_model.arch_and_attn_parameters()]
        # vector = [v.grad.data for v in unrolled_model.parameters()]
        vector = [v.grad.data for v in unrolled_model.parameters()]
        implicit_grads = self.hessian_vector_product(vector, x_train, target_train)

        for g, ig in zip(dalpha, implicit_grads):
            # g = g - eta * ig, from Eq. 6
            g.data.sub_(eta, ig.data)

        # write updated alpha into original model
        if use_DataParallel:
            for v, g in zip(self.model.module.arch_and_attn_parameters(), dalpha):
                if v.grad is None:
                    v.grad = g.data
                else:
                    v.grad.data.copy_(g.data)
        else:
            for v, g in zip(self.model.arch_and_attn_parameters(), dalpha):
                if v.grad is None:
                    v.grad = g.data
                else:
                    v.grad.data.copy_(g.data)

    def construct_model_from_theta(self, theta):
        """
        construct a new model with initialized weight from theta
        it use .state_dict() and load_state_dict() instead of
        .parameters() + fill_()
        :param theta: flatten weights, need to reshape to original shape
        :return:
        """
        model_new = self.model.module.new() if use_DataParallel else self.model.new()
        model_dict = self.model.module.state_dict() if use_DataParallel else self.model.state_dict()

        params, offset = {}, 0
        for k, v in self.model.named_parameters():
            v_length = v.numel()
            # restore theta[] value to original shape
            name = k[7:] if use_DataParallel else k
            params[name] = theta[offset: offset + v_length].view(v.size())
            offset += v_length

        assert offset == len(theta)
        model_dict.update(params)

        def load_state_dict(model: Network, state_dict, strict=True):
            """Copies parameters and buffers from :attr:`state_dict` into
            this module and its descendants. If :attr:`strict` is ``True`` then
            the keys of :attr:`state_dict` must exactly match the keys returned
            by this module's :func:`state_dict()` function.

            Arguments:
                state_dict (dict): A dict containing parameters and
                    persistent buffers.
                strict (bool): Strictly enforce that the keys in :attr:`state_dict`
                    match the keys returned by this module's `:func:`state_dict()`
                    function.
                    :param strict:
                    :param state_dict:
                    :param model:
            """
            own_state = model.state_dict()
            for name, param in state_dict.items():
                if name in own_state:
                    if isinstance(param, torch.nn.Parameter):
                        # backwards compatibility for serialized parameters
                        param = param.detach()
                    try:
                        own_state[name].copy_(param)
                    except Exception:
                        raise RuntimeError('While copying the parameter named {}, '
                                           'whose dimensions in the model are {} and '
                                           'whose dimensions in the checkpoint are {}.'
                                           .format(name, own_state[name].size(), param.size()))
                elif strict:
                    raise KeyError('unexpected key "{}" in state_dict'
                                   .format(name))
            if strict:
                missing = set(own_state.keys()) - set(state_dict.keys())
                if len(missing) > 0:
                    raise KeyError('missing keys in state_dict: "{}"'.format(missing))

        # model_new.load_state_dict(model_dict)
        load_state_dict(model_new, model_dict)

        if use_DataParallel:
            model_new = nn.DataParallel(model_new)
        return model_new.to(device)

    def hessian_vector_product(self, vector, x, target, r=1e-2):
        """
        slightly touch vector value to estimate the gradient with respect to alpha
        refer to Eq. 7 for more details.
        :param vector: gradient.data of parameters theta
        :param x:
        :param target:
        :param r:
        :return:
        """
        R = r / concat(vector).norm()

        for p, v in zip(self.model.parameters(), vector):
            # w+ = w + R * v
            p.data.add_(R, v)
        logits = self.model(x)
        loss = self.criterion(logits, target)
        # gradient with respect to alpha
        grads_p = autograd.grad(loss,
                                self.model.module.arch_and_attn_parameters() if use_DataParallel else self.model.arch_and_attn_parameters())

        for p, v in zip(self.model.parameters(), vector):
            # w- = (w+R*v) - 2R*v
            p.data.sub_(2 * R, v)
        logits = self.model(x)
        loss = self.criterion(logits, target)
        grads_n = autograd.grad(loss,
                                self.model.module.arch_and_attn_parameters() if use_DataParallel else self.model.arch_and_attn_parameters())

        for p, v in zip(self.model.parameters(), vector):
            # w = (w+R*v) - 2R*v + R*v
            p.data.add_(R, v)

        h = [(x - y).div_(2 * R) for x, y in zip(grads_p, grads_n)]
        # h len: 2 h0 torch.Size([14, 8])
        # print('h len:', len(h), 'h0', h[0].shape)
        return h