"""
Module to do callbacks for Keras models.
"""
from __future__ import division
from keras.callbacks import Callback # ModelCheckpoint , EarlyStopping
import matplotlib.pyplot as plt
import h5py
import itertools
import logging
import numpy as np
import os
import subprocess
import shutil
import codecs
import sys
from time import gmtime, strftime
import math
import time
from copy import deepcopy
# Set up logger
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
#logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
# Dimensionality of image feature vector
IMG_FEATS = 4096
HSN_SIZE = 409
MULTEVAL_DIR = '../multeval-0.5.1' if "util" in os.getcwd() else "multeval-0.5.1"


class cd:
    """Context manager for changing the current working directory.

    http://stackoverflow.com/questions/431684/how-do-i-cd-in-python
    """
    def __init__(self, newPath):
        self.newPath = newPath

    def __enter__(self):
        self.savedPath = os.getcwd()
        os.chdir(self.newPath)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.savedPath)
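
# Example usage (a minimal sketch): temporarily work inside another
# directory; the original working directory is restored on exit, even
# if the block raises an exception.
#
#     with cd(MULTEVAL_DIR):
#         print(os.getcwd())  # now inside the multeval directory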


class CompilationOfCallbacks(Callback):
    """ Collection of compiled callbacks."""

    def __init__(self, word2index, index2word, argsDict, dataset,
                 data_generator, use_sourcelang=False, use_image=True):
        super(CompilationOfCallbacks, self).__init__()

        self.verbose = True
        self.filename = "weights.hdf5"
        self.save_best_only = True

        self.val_loss = []
        self.best_val_loss = np.inf
        self.val_metric = []
        self.best_val_metric = np.NINF

        self.word2index = word2index
        self.index2word = index2word
        self.args = argsDict

        # used to control early stopping on the validation data
        self.wait = 0
        self.patience = self.args.patience

        # needed by model.predict in generate_sentences
        self.use_sourcelang = use_sourcelang
        self.use_image = use_image

        # controversial assignment but it makes it much easier to
        # do early stopping based on metrics
        self.data_generator = data_generator

        # this results in two open file handles on dataset.h5 (here and
        # in data_generator)
        if not dataset:
            logger.warn("No dataset given, using flickr8k")
            self.dataset = h5py.File("flickr8k/dataset.h5", "r")
        else:
            self.dataset = h5py.File("%s/dataset.h5" % dataset, "r")
        if self.args.source_vectors is not None:
            self.source_dataset = h5py.File("%s/dataset.h5"
                                            % self.args.source_vectors, "r")
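
    # Attributes that the methods below read from self.args (all must be
    # present on the argparse namespace passed in as argsDict): patience,
    # run_string, source_vectors, meteor_lang, no_early_stopping,
    # generate_from_N_words and generation_timesteps.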

    def on_epoch_end(self, epoch, logs={}):
        '''
        At the end of each epoch we:
          1. create a directory to checkpoint data
          2. save the arguments used to initialise the run
          3. generate sentences for the val data by sampling from the model
          4. calculate the metric scores of the generated sentences
          5. determine whether to stop training (exits with sys.exit(0))
          6. checkpoint the model parameters, tracking the best metric score
        '''
        savetime = strftime("%d%m%Y-%H%M%S", gmtime())
        path = self.create_checkpoint_directory(savetime)
        self.save_run_arguments(path)

        # Generate val sentences to check for overfitting
        self.generate_sentences(path)
        meteor, bleu, ter = self.multeval_scores(path)
        val_loss = logs.get('val_loss')

        self.early_stop_decision(len(self.val_metric) + 1, meteor, val_loss)
        self.checkpoint_parameters(epoch, logs, path, meteor, val_loss)
        self.log_performance()

    def early_stop_decision(self, epoch, val_metric, val_loss):
        '''
        Stop training if validation loss has stopped decreasing and the
        validation metric (Meteor, as called from on_epoch_end) has not
        increased for --patience epochs.

        WARNING: quits with sys.exit(0).

        TODO: this doesn't yet support early stopping based on TER
        '''
        if val_loss < self.best_val_loss:
            self.wait = 0
        elif val_metric > self.best_val_metric or self.args.no_early_stopping:
            self.wait = 0
        else:
            self.wait += 1
            if self.wait >= self.patience:
                # we have exceeded patience
                if val_loss > self.best_val_loss:
                    # and loss is no longer decreasing
                    logger.info("Epoch %d: early stopping", epoch)
                    handle = open("checkpoints/%s/summary"
                                  % self.args.run_string, "a")
                    handle.write("Early stopping because patience exceeded\n")
                    best_bleu = np.nanargmax(self.val_metric)
                    best_loss = np.nanargmin(self.val_loss)
                    logger.info("Best Metric: %d | val loss %.5f score %.2f",
                                best_bleu + 1, self.val_loss[best_bleu],
                                self.val_metric[best_bleu])
                    logger.info("Best loss: %d | val loss %.5f score %.2f",
                                best_loss + 1, self.val_loss[best_loss],
                                self.val_metric[best_loss])
                    handle.close()
                    sys.exit(0)

    def log_performance(self):
        '''
        Record model performance so far, based on validation loss.
        '''
        handle = open("checkpoints/%s/summary" % self.args.run_string, "w")
        for epoch in range(len(self.val_loss)):
            handle.write("Checkpoint %d | val loss: %.5f bleu %.2f\n"
                         % (epoch + 1, self.val_loss[epoch],
                            self.val_metric[epoch]))

        logger.info("---")  # break up the presentation for clarity

        # BLEU is the quickest indicator of performance for our task
        # but loss is our objective function
        best_bleu = np.nanargmax(self.val_metric)
        best_loss = np.nanargmin(self.val_loss)
        logger.info("Best Metric: %d | val loss %.5f score %.2f",
                    best_bleu + 1, self.val_loss[best_bleu],
                    self.val_metric[best_bleu])
        handle.write("Best Metric: %d | val loss %.5f score %.2f\n"
                     % (best_bleu + 1, self.val_loss[best_bleu],
                        self.val_metric[best_bleu]))
        logger.info("Best loss: %d | val loss %.5f score %.2f",
                    best_loss + 1, self.val_loss[best_loss],
                    self.val_metric[best_loss])
        handle.write("Best loss: %d | val loss %.5f score %.2f\n"
                     % (best_loss + 1, self.val_loss[best_loss],
                        self.val_metric[best_loss]))
        logger.info("Early stopping marker: wait/patience: %d/%d\n",
                    self.wait, self.patience)
        handle.write("Early stopping marker: wait/patience: %d/%d\n" %
                     (self.wait, self.patience))
        handle.close()

    def extract_references(self, directory, split):
        """
        Get reference descriptions for val or test data.
        """
        references = self.data_generator.get_refs_by_split_as_list(split)

        for refid in xrange(len(references[0])):
            codecs.open('%s/%s_reference.ref%d' % (directory, split, refid),
                        'w', 'utf-8').write(
                            '\n'.join([x[refid] for x in references]))
        return references
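
    # File layout written above (a sketch, assuming 5 references per
    # instance): <split>_reference.ref0 ... <split>_reference.ref4, where
    # line i of file refN holds the N-th reference description of instance
    # i. This parallel-file layout is what multi-bleu.perl and multeval
    # consume below.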

    def __bleu_score__(self, directory, val=True):
        '''
        Loss is only weakly correlated with improvements in BLEU,
        and thus improvements in human judgements. Let's also track
        the BLEU score of a subset of generated sentences in the val split
        to decide on early stopping, etc.
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, split=prefix)

        subprocess.check_call(
            ['perl multi-bleu.perl %s/%s_reference.ref < %s/%sGenerated'
             ' > %s/%sBLEU'
             % (directory, prefix, directory, prefix, directory, prefix)],
            shell=True)
        bleudata = open("%s/%sBLEU" % (directory, prefix)).readline()
        data = bleudata.split(",")[0]
        bleuscore = data.split("=")[1]
        bleu = float(bleuscore.lstrip())
        return bleu
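
    # multi-bleu.perl prints a single summary line of the form:
    #   BLEU = 23.45, 60.1/30.2/18.4/11.2 (BP=0.998, ratio=..., ...)
    # which is why __bleu_score__ splits on "," and then on "=" above.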

    def multeval_scores(self, directory, val=True):
        '''
        Maybe you want to do early stopping using Meteor, TER, or BLEU?
        '''
        prefix = "val" if val else "test"
        self.extract_references(directory, prefix)

        # First, re-join the compound-split German words
        if self.args.meteor_lang == 'de':
            subprocess.check_call(
                ["cp %s/%sGenerated %s/%sGenerated.orig"
                 % (directory, prefix, directory, prefix)], shell=True)
            subprocess.check_call(
                ["sed -i -r 's/ @(.*?)@ //g' %s/%sGenerated"
                 % (directory, prefix)], shell=True)
            subprocess.check_call(
                ["sed -i -r 's/ @(.*?)@ //g' %s/%s_reference.*"
                 % (directory, prefix)], shell=True)

        with cd(MULTEVAL_DIR):
            subprocess.check_call(
                ['./multeval.sh eval --refs ../%s/%s_reference.* \
                  --hyps-baseline ../%s/%sGenerated \
                  --meteor.language %s \
                  --threads 1 \
                  2> %s-multevaloutput 1> %s-multevaloutput'
                 % (directory, prefix, directory, prefix,
                    self.args.meteor_lang, self.args.run_string,
                    self.args.run_string)], shell=True)
            handle = open("%s-multevaloutput" % self.args.run_string)
            multdata = handle.readlines()
            handle.close()

        for line in multdata:
            if line.startswith("RESULT: baseline: BLEU: AVG:"):
                mbleu = line.split(":")[4]
                mbleu = mbleu.replace("\n", "")
                mbleu = mbleu.strip()
                lr = mbleu.split(".")
                mbleu = float(lr[0] + "." + lr[1][0:2])
            if line.startswith("RESULT: baseline: METEOR: AVG:"):
                mmeteor = line.split(":")[4]
                mmeteor = mmeteor.replace("\n", "")
                mmeteor = mmeteor.strip()
                lr = mmeteor.split(".")
                mmeteor = float(lr[0] + "." + lr[1][0:2])
            if line.startswith("RESULT: baseline: TER: AVG:"):
                mter = line.split(":")[4]
                mter = mter.replace("\n", "")
                mter = mter.strip()
                lr = mter.split(".")
                mter = float(lr[0] + "." + lr[1][0:2])

        logger.info("Meteor = %.2f | BLEU = %.2f | TER = %.2f",
                    mmeteor, mbleu, mter)
        return mmeteor, mbleu, mter
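
    # multeval writes summary lines of the form:
    #   RESULT: baseline: METEOR: AVG: 31.57 ...
    # The parsing above takes the fifth ":"-separated field of each such
    # line and truncates it to two decimal places.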

    def create_checkpoint_directory(self, savetime):
        '''
        We will create one directory to store all of the epoch data inside.
        The name is based on the run_string (if provided), the checkpoint
        number, and the current time.
        '''
        prefix = self.args.run_string if self.args.run_string != "" else ""
        number = "%03d" % (len(self.val_metric) + 1)
        filepath = "checkpoints/%s/%s-%s" % (prefix, number, savetime)
        try:
            os.mkdir("checkpoints/%s/" % prefix)
            shutil.copyfile("train.py", "checkpoints/%s/train.py" % prefix)
            shutil.copyfile("models.py", "checkpoints/%s/models.py" % prefix)
        except OSError:
            pass  # directory already exists
        try:
            os.mkdir(filepath)
        except OSError:
            pass  # directory already exists
        logger.info("\nIn %s ...", filepath)
        return filepath
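
    # Resulting on-disk layout (a sketch, assuming run_string="myrun"):
    #   checkpoints/myrun/train.py, models.py, summary
    #   checkpoints/myrun/001-<ddmmYYYY-HHMMSS>/
    #       argparse.args, valGenerated, val_reference.ref*,
    #       weights.hdf5, optimiser_params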

    def save_run_arguments(self, filepath):
        '''
        Save the command-line arguments, along with the method defaults,
        used to parameterise this run.
        '''
        handle = open("%s/argparse.args" % filepath, "w")
        for arg, value in self.args.__dict__.iteritems():
            handle.write("%s: %s\n" % (arg, str(value)))
        handle.close()

    def checkpoint_parameters(self, epoch, logs, filepath, cur_val_metric,
                              cur_val_loss=0.):
        '''
        We checkpoint the model parameters based on either PPLX reduction
        or metric score increase in the validation data. This is driven by
        the user-specified argument self.args.stopping_loss.

        TODO: this doesn't yet support early stopping based on TER
        '''
        weights_path = "%s/weights.hdf5" % filepath

        self.val_loss.append(cur_val_loss)
        if cur_val_loss < self.best_val_loss:
            self.best_val_loss = cur_val_loss

        # save the weights anyway for debug purposes
        self.model.save_weights(weights_path, overwrite=True)

        # update the best values, if applicable
        self.val_metric.append(cur_val_metric)
        if cur_val_metric > self.best_val_metric:
            self.best_val_metric = cur_val_metric

        optimiser_params = open("%s/optimiser_params" % filepath, "w")
        for key, value in self.model.optimizer.get_config().items():
            optimiser_params.write("%s: %s\n" % (key, value))
        optimiser_params.close()

    def reset_text_arrays(self, text_arrays, fixed_words=1):
        """ Reset the values in the text data structure to zero after the
        first fixed_words timesteps, so gold-standard tokens cannot
        accidentally leak into the model during generation. """
        reset_arrays = deepcopy(text_arrays)
        reset_arrays[:, fixed_words:, :] = 0
        return reset_arrays
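
    # For example (a sketch of the assumed shapes): text_arrays is a
    # one-hot array of shape (batch_size, timesteps, vocab_size); with
    # fixed_words=1, everything after the <BOS> timestep is zeroed:
    #
    #     text = np.zeros((2, 5, 100))   # batch of 2, 5 timesteps
    #     text[:, :, 3] = 1.             # dummy one-hot tokens
    #     reset = self.reset_text_arrays(text, fixed_words=1)
    #     assert reset[:, 1:, :].sum() == 0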

    def generate_sentences(self, filepath, val=True):
        """
        Generates descriptions of images for --generation_timesteps
        iterations through the LSTM. Each input description is clipped to
        the first <BOS> token or, if --generate_from_N_words is set, to
        the first N words (plus the <BOS> token).

        This process can additionally be conditioned on source language
        hidden representations, if they are provided by the
        --source_vectors parameter.

        The output is clipped to the first <E> (end-of-string) token
        generated, if it exists.

        TODO: duplicated method with generate.py
        """
        prefix = "val" if val else "test"
        logger.info("Generating %s descriptions", prefix)
        start_gen = self.args.generate_from_N_words + 1  # include BOS
        handle = codecs.open("%s/%sGenerated" % (filepath, prefix),
                             "w", 'utf-8')

        val_generator = self.data_generator.generation_generator(
            prefix, in_callbacks=True)
        seen = 0
        for data in val_generator:
            inputs = data[0]
            text = deepcopy(inputs['text'])
            # Append the first start_gen words to the complete_sentences
            # list for each instance in the batch.
            complete_sentences = [[] for _ in range(text.shape[0])]
            for t in range(start_gen):  # minimum 1
                for i in range(text.shape[0]):
                    w = np.argmax(text[i, t])
                    complete_sentences[i].append(self.index2word[w])
            del inputs['text']
            text = self.reset_text_arrays(text, start_gen)
            Y_target = data[1]
            inputs['text'] = text

            for t in range(start_gen, self.args.generation_timesteps):
                logger.debug("Input token: %s",
                             self.index2word[np.argmax(inputs['text'][0, t-1])])
                preds = self.model.predict(inputs, verbose=0)

                # Look at the last indices for the words.
                #next_word_indices = np.argmax(preds['output'][:, t-1], axis=1)
                next_word_indices = np.argmax(preds[:, t-1], axis=1)
                logger.debug("Predicted token: %s",
                             self.index2word[next_word_indices[0]])
                # update the sentence-so-far with the generated words.
                for i in range(len(next_word_indices)):
                    inputs['text'][i, t, next_word_indices[i]] = 1.
                next_words = [self.index2word[x] for x in next_word_indices]
                for i in range(len(next_words)):
                    complete_sentences[i].append(next_words[i])

            sys.stdout.flush()
            # print/extract each sentence until it hits the first
            # end-of-string token
            for s in complete_sentences:
                decoded_str = ' '.join([x for x in
                                        itertools.takewhile(
                                            lambda n: n != "<E>", s[1:])])
                handle.write(decoded_str + "\n")

            seen += text.shape[0]
            # use the current split's size ('val' or 'test'), not
            # unconditionally 'val'
            if seen >= self.data_generator.split_sizes[prefix]:
                # Hacky way to break out of the generator
                break
        handle.close()
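
# Example usage (a hedged sketch; `model`, `word2index`, `index2word`,
# `args`, `data_gen`, and the train/val arrays are assumed to come from
# the surrounding training script, e.g. train.py):
#
#     callbacks = CompilationOfCallbacks(word2index, index2word, args,
#                                        "flickr8k", data_gen,
#                                        use_sourcelang=False,
#                                        use_image=True)
#     model.fit(train_inputs, train_targets,
#               validation_data=(val_inputs, val_targets),
#               callbacks=[callbacks])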