from collections import Counter
from contextlib import contextmanager
import copy
from functools import partial
from itertools import chain
import os
import random
import re
import string
import time
import warnings
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer, SnowballStemmer
from nltk.stem.lancaster import LancasterStemmer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.utils import shuffle
import torch
import torch.nn as nn
from torch.utils.data import Dataset, Sampler, DataLoader
from torch.optim.optimizer import Optimizer
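# Paths to the competition data and the pretrained fastText vectors, followed by the
# model / training hyperparameters and the stemmers used for embedding lookup.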
EMBEDDING_FASTTEXT = '../input/fasttext-crawl-300d-2m/crawl-300d-2M.vec'
TRAIN_DATA = '../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv'
TEST_DATA = '../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv'
SAMPLE_SUBMISSION = '../input/jigsaw-unintended-bias-in-toxicity-classification/sample_submission.csv'
embed_size = 300
max_features = 100000
max_len = 220
batch_size = 512
train_epochs = 6
n_splits = 5
mu = 0.9
updates_per_epoch = 10
seed = 1029
device = torch.device('cuda:0')
ps = PorterStemmer()
lc = LancasterStemmer()
sb = SnowballStemmer('english')
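# Utility helpers: a timing context manager for logging section durations, and a
# seeding function that fixes the Python, NumPy and PyTorch RNGs for reproducibility.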
@contextmanager
def timer(msg):
t0 = time.time()
print(f'[{msg}] start.')
yield
elapsed_time = time.time() - t0
print(f'[{msg}] done in {elapsed_time / 60:.2f} min.')
def seed_torch(seed=1029):
random.seed(seed)
os.environ['PYTHONHASHSEED'] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
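# Contraction / misspelling map applied to the comment text before any further cleaning.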
misspell_dict = {"aren't": "are not", "can't": "cannot", "couldn't": "could not",
                 "didn't": "did not", "doesn't": "does not", "don't": "do not",
                 "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
                 "he'd": "he would", "he'll": "he will", "he's": "he is",
                 "i'd": "I had", "i'll": "I will", "i'm": "I am", "isn't": "is not",
                 "it's": "it is", "it'll": "it will", "i've": "I have", "let's": "let us",
                 "mightn't": "might not", "mustn't": "must not", "shan't": "shall not",
                 "she'd": "she would", "she'll": "she will", "she's": "she is",
                 "shouldn't": "should not", "that's": "that is", "there's": "there is",
                 "they'd": "they would", "they'll": "they will", "they're": "they are",
                 "they've": "they have", "we'd": "we would", "we're": "we are",
                 "weren't": "were not", "we've": "we have", "what'll": "what will",
                 "what're": "what are", "what's": "what is", "what've": "what have",
                 "where's": "where is", "who'd": "who would", "who'll": "who will",
                 "who're": "who are", "who's": "who is", "who've": "who have",
                 "won't": "will not", "wouldn't": "would not", "you'd": "you would",
                 "you'll": "you will", "you're": "you are", "you've": "you have",
                 "'re": " are", "wasn't": "was not", "we'll": "we will", "tryin'": "trying"}
def _get_misspell(misspell_dict):
misspell_re = re.compile('(%s)' % '|'.join(misspell_dict.keys()))
return misspell_dict, misspell_re
def replace_typical_misspell(text):
misspellings, misspellings_re = _get_misspell(misspell_dict)
def replace(match):
return misspellings[match.group(0)]
return misspellings_re.sub(replace, text)
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']',
'>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£', '·', '_', '{', '}', '©', '^',
'®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█',
'½', 'à', '…', '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶',
'↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', ':', '¼',
'⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', ')', '↓', '、', '│', '(', '»', ',', '♪',
'╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√']
def clean_text(x):
x = str(x)
for punct in puncts + list(string.punctuation):
if punct in x:
x = x.replace(punct, f' {punct} ')
return x
def clean_numbers(x):
return re.sub(r'\d+', ' ', x)
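# Build the embedding matrix for the vocabulary. For each token the lookup falls back
# through several variants (raw, lower, upper, capitalized, then Porter, Lancaster and
# Snowball stems) and keeps the first fastText vector found; tokens with no match stay
# as zero vectors.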
def load_embedding(embedding_path, word_index):
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')
    with open(embedding_path) as f:
        embeddings_index = dict(get_coefs(*line.strip().split(' ')) for line in f)
    nb_words = min(max_features + 2, len(word_index))
    embedding_matrix = np.zeros((nb_words, embed_size))
    for key, i in word_index.items():
        # try the raw token first, then case variants, then stemmed forms,
        # and keep the first fastText vector that matches
        for word in (key, key.lower(), key.upper(), key.capitalize(),
                     ps.stem(key), lc.stem(key), sb.stem(key)):
            embedding_vector = embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                break
    return embedding_matrix
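# Load train/test, normalize the text (lowercase, expand contractions, pad punctuation,
# strip digits) and build the targets. Each training row also gets a sample weight that
# grows with its identity annotations and toxicity score; this appears to follow the
# identity-aware weighting commonly used for this competition's bias metric.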
def load_and_prec():
train = pd.read_csv(TRAIN_DATA, index_col='id')
test = pd.read_csv(TEST_DATA, index_col='id')
# lower
train['comment_text'] = train['comment_text'].str.lower()
test['comment_text'] = test['comment_text'].str.lower()
# clean misspellings
train['comment_text'] = train['comment_text'].apply(replace_typical_misspell)
test['comment_text'] = test['comment_text'].apply(replace_typical_misspell)
# clean the text
train['comment_text'] = train['comment_text'].apply(clean_text)
test['comment_text'] = test['comment_text'].apply(clean_text)
# clean numbers
train['comment_text'] = train['comment_text'].apply(clean_numbers)
test['comment_text'] = test['comment_text'].apply(clean_numbers)
# strip
train['comment_text'] = train['comment_text'].str.strip()
test['comment_text'] = test['comment_text'].str.strip()
# replace blank with nan
    train['comment_text'] = train['comment_text'].replace('', np.nan)
    test['comment_text'] = test['comment_text'].replace('', np.nan)
# nan prediction
nan_pred = train['target'][train['comment_text'].isna()].mean()
# fill up the missing values
train_x = train['comment_text'].fillna('_##_').values
test_x = test['comment_text'].fillna('_##_').values
# get the target values
identity_columns = [
'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
'muslim', 'black', 'white', 'psychiatric_or_mental_illness']
weights = np.ones((len(train),))
weights += train[identity_columns].fillna(0).values.sum(axis=1) * 3
weights += train['target'].values * 8
weights /= weights.max()
train_y = np.vstack([train['target'].values, weights]).T
train_y_identity = train[identity_columns].values
# shuffling the data
np.random.seed(seed)
train_idx = np.random.permutation(len(train_x))
train_x = train_x[train_idx]
train_y = train_y[train_idx]
train_y_identity = train_y_identity[train_idx]
return train_x, train_y, train_y_identity, test_x, nan_pred
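# Vocabulary over the most frequent tokens, with id 0 reserved for <PAD> and
# id max_features + 1 reserved for <UNK>.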
def build_vocab(texts, max_features):
counter = Counter()
for text in texts:
counter.update(text.split())
vocab = {
'token2id': {'<PAD>': 0, '<UNK>': max_features + 1},
'id2token': {}
}
vocab['token2id'].update(
{token: _id + 1 for _id, (token, count) in
enumerate(counter.most_common(max_features))})
vocab['id2token'] = {v: k for k, v in vocab['token2id'].items()}
return vocab
def tokenize(texts, vocab):
def text2ids(text, token2id):
return [
            token2id.get(token, token2id['<UNK>'])
for token in text.split()[:max_len]]
return [
text2ids(text, vocab['token2id'])
for text in texts]
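# Dataset yielding (index, token-id sequence, target). For the test set, which has no
# targets, random binary dummies are substituted so the same collate function applies.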
class TextDataset(Dataset):
def __init__(self, seqs, targets=None, maxlen=200):
if targets is not None:
self.targets = targets
else:
self.targets = np.random.randint(2, size=(len(seqs),))
self.seqs = seqs
self.maxlen = maxlen
def __len__(self):
return len(self.seqs)
def get_keys(self):
lens = np.fromiter(
((min(self.maxlen, len(seq))) for seq in self.seqs),
dtype=np.int32)
return lens
def __getitem__(self, index):
return index, self.seqs[index], self.targets[index]
def collate_fn(data):
def _pad_sequences(seqs):
lens = [len(seq) for seq in seqs]
max_len = max(lens)
padded_seqs = torch.zeros(len(seqs), max_len).long()
for i, seq in enumerate(seqs):
start = max_len - lens[i]
padded_seqs[i, start:] = torch.LongTensor(seq)
return padded_seqs
index, seqs, targets = zip(*data)
seqs = _pad_sequences(seqs)
return index, seqs, torch.FloatTensor(targets)
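# Sampler that groups sequences of similar length so each batch needs little padding:
# indices are split into buckets, sorted by descending length within each bucket, cut
# into batches, and the batches are shuffled inside their bucket.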
class BucketSampler(Sampler):
def __init__(self, data_source, sort_keys, bucket_size=None, batch_size=1048, shuffle_data=True):
super().__init__(data_source)
self.shuffle = shuffle_data
self.batch_size = batch_size
self.sort_keys = sort_keys
self.bucket_size = bucket_size if bucket_size is not None else len(sort_keys)
self.weights = None
if not shuffle_data:
self.index = self.prepare_buckets()
else:
self.index = None
def set_weights(self, weights):
        assert np.all(weights >= 0)
total = np.sum(weights)
if total != 1:
weights = weights / total
self.weights = weights
def __iter__(self):
indices = None
if self.weights is not None:
total = len(self.sort_keys)
indices = np.random.choice(total, (total,), p=self.weights)
if self.shuffle:
self.index = self.prepare_buckets(indices)
return iter(self.index)
def get_reverse_indexes(self):
indexes = np.zeros((len(self.index),), dtype=np.int32)
for i, j in enumerate(self.index):
indexes[j] = i
return indexes
def __len__(self):
return len(self.sort_keys)
def prepare_buckets(self, indices=None):
lens = - self.sort_keys
assert self.bucket_size % self.batch_size == 0 or self.bucket_size == len(lens)
if indices is None:
if self.shuffle:
indices = shuffle(np.arange(len(lens), dtype=np.int32))
lens = lens[indices]
else:
indices = np.arange(len(lens), dtype=np.int32)
# bucket iterator
def divide_chunks(l, n):
if n == len(l):
yield np.arange(len(l), dtype=np.int32), l
else:
# looping till length l
for i in range(0, len(l), n):
data = l[i:i + n]
yield np.arange(i, i + len(data), dtype=np.int32), data
new_indices = []
extra_batch = None
for chunk_index, chunk in divide_chunks(lens, self.bucket_size):
# sort indices in bucket by descending order of length
indices_sorted = chunk_index[np.argsort(chunk, axis=-1)]
batches = []
for _, batch in divide_chunks(indices_sorted, self.batch_size):
if len(batch) == self.batch_size:
batches.append(batch.tolist())
else:
assert extra_batch is None
assert batch is not None
extra_batch = batch
# shuffling batches within buckets
if self.shuffle:
batches = shuffle(batches)
for batch in batches:
new_indices.extend(batch)
if extra_batch is not None:
new_indices.extend(extra_batch)
return indices[new_indices]
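# Frozen fastText embeddings with spatial dropout, followed by a bidirectional LSTM and
# a bidirectional GRU. The GRU's final hidden states are concatenated with average and
# max pooling over its outputs, passed through a small dense head, and a single logit
# is returned (the sigmoid / BCEWithLogitsLoss is applied outside the model).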
class NeuralNet(nn.Module):
def __init__(self, embedding_matrix):
super(NeuralNet, self).__init__()
lstm_hidden_size = 120
gru_hidden_size = 60
self.gru_hidden_size = gru_hidden_size
self.embedding = nn.Embedding(*embedding_matrix.shape)
self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
self.embedding.weight.requires_grad = False
self.embedding_dropout = nn.Dropout2d(0.2)
self.lstm = nn.LSTM(embedding_matrix.shape[1], lstm_hidden_size, bidirectional=True, batch_first=True)
self.gru = nn.GRU(lstm_hidden_size * 2, gru_hidden_size, bidirectional=True, batch_first=True)
self.linear = nn.Linear(gru_hidden_size * 6, 20)
self.relu = nn.ReLU()
self.dropout = nn.Dropout(0.1)
self.out = nn.Linear(20, 1)
def apply_spatial_dropout(self, h_embedding):
h_embedding = h_embedding.transpose(1, 2).unsqueeze(2)
h_embedding = self.embedding_dropout(h_embedding).squeeze(2).transpose(1, 2)
return h_embedding
def forward(self, x):
h_embedding = self.embedding(x)
h_embedding = self.apply_spatial_dropout(h_embedding)
h_lstm, _ = self.lstm(h_embedding)
h_gru, hh_gru = self.gru(h_lstm)
        # h_n has shape (num_directions, batch, hidden); concatenate the two
        # directions per example instead of flattening across the batch
        hh_gru = torch.cat((hh_gru[0], hh_gru[1]), dim=1)
avg_pool = torch.mean(h_gru, 1)
max_pool, _ = torch.max(h_gru, 1)
conc = torch.cat((hh_gru, avg_pool, max_pool), 1)
conc = self.relu(self.linear(conc))
conc = self.dropout(conc)
out = self.out(conc)
return out
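# Exponential moving average of the trainable weights. Depending on `level`, the shadow
# weights are refreshed every `n` batches or once per epoch, and can later be copied
# into a separate evaluation model via set_weights().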
class EMA:
def __init__(self, model, mu, level='batch', n=1):
self.mu = mu
self.level = level
self.n = n
self.cnt = self.n
self.shadow = {}
for name, param in model.named_parameters():
if param.requires_grad:
                self.shadow[name] = param.data.clone()
def _update(self, model):
for name, param in model.named_parameters():
if param.requires_grad:
new_average = (1 - self.mu) * param.data + self.mu * self.shadow[name]
self.shadow[name] = new_average.clone()
def set_weights(self, ema_model):
for name, param in ema_model.named_parameters():
if param.requires_grad:
param.data = self.shadow[name]
def on_batch_end(self, model):
        if self.level == 'batch':
self.cnt -= 1
if self.cnt == 0:
self._update(model)
self.cnt = self.n
def on_epoch_end(self, model):
        if self.level == 'epoch':
self._update(model)
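# Minimal learning-rate scheduler: on every batch the learning rate is set to
# scale_fn(progress), where progress runs from 0 to 1 over `step_size` iterations.
# combine_scale_functions() stitches several cosine segments into one schedule, which
# is used below as a short warm-up followed by a cosine decay.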
class ParamScheduler:
def __init__(self, optimizer, scale_fn, step_size):
if not isinstance(optimizer, Optimizer):
raise TypeError('{} is not an Optimizer'.format(
type(optimizer).__name__))
self.optimizer = optimizer
self.scale_fn = scale_fn
self.step_size = step_size
self.last_batch_iteration = 0
def batch_step(self):
for param_group in self.optimizer.param_groups:
param_group['lr'] = self.scale_fn(self.last_batch_iteration / self.step_size)
self.last_batch_iteration += 1
def combine_scale_functions(scale_fns, phases=None):
if phases is None:
phases = [1. / len(scale_fns)] * len(scale_fns)
phases = [phase / sum(phases) for phase in phases]
phases = torch.tensor([0] + phases)
phases = torch.cumsum(phases, 0)
def _inner(x):
idx = (x >= phases).nonzero().max()
actual_x = (x - phases[idx]) / (phases[idx + 1] - phases[idx])
return scale_fns[idx](actual_x)
return _inner
def scale_cos(start, end, x):
return start + (1 + np.cos(np.pi * (1 - x))) * (end - start) / 2
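# Evaluator for the competition's bias metric: per-identity subgroup AUC, BPSN
# (background positive, subgroup negative) AUC and BNSP (background negative, subgroup
# positive) AUC are combined with a generalized power mean (p = -5), and the final
# score is 0.25 * overall AUC + 0.75 * the average of the three power means.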
class JigsawEvaluator:
def __init__(self, y_binary, y_identity_binary, power=-5, overall_model_weight=0.25):
self.y = y_binary
self.y_i = y_identity_binary
self.n_subgroups = self.y_i.shape[1]
self.power = power
self.overall_model_weight = overall_model_weight
@staticmethod
def _compute_auc(y_true, y_pred):
try:
return roc_auc_score(y_true, y_pred)
except ValueError:
return np.nan
def _compute_subgroup_auc(self, i, y_pred):
mask = self.y_i[:, i] == 1
return self._compute_auc(self.y[mask], y_pred[mask])
def _compute_bpsn_auc(self, i, y_pred):
mask = self.y_i[:, i] + self.y == 1
return self._compute_auc(self.y[mask], y_pred[mask])
def _compute_bnsp_auc(self, i, y_pred):
mask = self.y_i[:, i] + self.y != 1
return self._compute_auc(self.y[mask], y_pred[mask])
def compute_bias_metrics_for_model(self, y_pred):
records = np.zeros((3, self.n_subgroups))
for i in range(self.n_subgroups):
records[0, i] = self._compute_subgroup_auc(i, y_pred)
records[1, i] = self._compute_bpsn_auc(i, y_pred)
records[2, i] = self._compute_bnsp_auc(i, y_pred)
return records
def _calculate_overall_auc(self, y_pred):
return roc_auc_score(self.y, y_pred)
def _power_mean(self, array):
total = sum(np.power(array, self.power))
return np.power(total / len(array), 1 / self.power)
def get_final_metric(self, y_pred):
bias_metrics = self.compute_bias_metrics_for_model(y_pred)
bias_score = np.average([
self._power_mean(bias_metrics[0]),
self._power_mean(bias_metrics[1]),
self._power_mean(bias_metrics[2])
])
overall_score = self.overall_model_weight * self._calculate_overall_auc(y_pred)
bias_score = (1 - self.overall_model_weight) * bias_score
return overall_score + bias_score
def sigmoid(x):
return 1 / (1 + np.exp(-x))
def eval_model(model, data_loader):
model.eval()
preds_fold = np.zeros(len(data_loader.dataset))
with torch.no_grad():
for index, x_batch, y_batch in data_loader:
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
y_pred = model(x_batch).detach()
preds_fold[list(index)] = sigmoid(y_pred.cpu().numpy())[:, 0]
return preds_fold
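# Main script: silence warnings, fix the seeds, then load and preprocess the data and
# build the vocabulary and embedding matrix.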
warnings.filterwarnings('ignore')
seed_torch(seed)
with timer('load data'):
train_x, train_y, train_y_identity, test_x, nan_pred = load_and_prec()
train_nan_mask = train_x == '_##_'
test_nan_mask = test_x == '_##_'
y_binary = (train_y[:, 0] >= 0.5).astype(int)
y_identity_binary = (train_y_identity >= 0.5).astype(int)
vocab = build_vocab(chain(train_x, test_x), max_features)
embedding_matrix = load_embedding(EMBEDDING_FASTTEXT, vocab['token2id'])
train_x = np.array(tokenize(train_x, vocab))
test_x = np.array(tokenize(test_x, vocab))
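# Pseudo-labeling pass: train a single model on the full shuffled training set, average
# its per-epoch test predictions (weighted toward later epochs) with the EMA model's
# predictions, and keep the result as soft labels (weight 1) for the test set; these
# are mixed into the cross-validation folds below.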
with timer('pseudo label'):
train_preds = np.zeros((len(train_x)))
test_preds = np.zeros((len(test_x)))
ema_train_preds = np.zeros((len(train_x)))
ema_test_preds = np.zeros((len(test_x)))
train_dataset = TextDataset(train_x, targets=train_y, maxlen=max_len)
test_dataset = TextDataset(test_x, maxlen=max_len)
train_sampler = BucketSampler(train_dataset, train_dataset.get_keys(),
bucket_size=batch_size * 20, batch_size=batch_size)
test_sampler = BucketSampler(test_dataset, test_dataset.get_keys(),
batch_size=batch_size, shuffle_data=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False,
sampler=train_sampler, num_workers=0, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, sampler=test_sampler,
shuffle=False, num_workers=0, collate_fn=collate_fn)
models = {}
model = NeuralNet(embedding_matrix).to(device)
ema_model = copy.deepcopy(model)
ema_model.eval()
ema_n = int(len(train_loader.dataset) / (updates_per_epoch * batch_size))
ema = EMA(model, mu, n=ema_n)
scale_fn = combine_scale_functions(
[partial(scale_cos, 1e-4, 5e-3), partial(scale_cos, 5e-3, 1e-3)], [0.2, 0.8])
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
scheduler = ParamScheduler(optimizer, scale_fn, train_epochs * len(train_loader))
all_test_preds = []
for epoch in range(train_epochs):
start_time = time.time()
model.train()
for _, x_batch, y_batch in train_loader:
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
scheduler.batch_step()
y_pred = model(x_batch)
loss = nn.BCEWithLogitsLoss(weight=y_batch[:, 1])(y_pred[:, 0], y_batch[:, 0])
optimizer.zero_grad()
loss.backward()
optimizer.step()
ema.on_batch_end(model)
elapsed_time = time.time() - start_time
print('Epoch {}/{} \t time={:.2f}s'.format(
epoch + 1, train_epochs, elapsed_time))
        # keep the per-epoch predictions under their own name so the global
        # test_preds accumulator is not overwritten
        epoch_test_preds = eval_model(model, test_loader)
        all_test_preds.append(epoch_test_preds)
ema.on_epoch_end(model)
ema.set_weights(ema_model)
ema_model.lstm.flatten_parameters()
ema_model.gru.flatten_parameters()
checkpoint_weights = np.array([2 ** epoch for epoch in range(train_epochs)])
checkpoint_weights = checkpoint_weights / checkpoint_weights.sum()
ema_test_y = eval_model(ema_model, test_loader)
test_y = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
test_y = np.mean([test_y, ema_test_y], axis=0)
test_y[test_nan_mask] = nan_pred
weight = np.ones((len(test_y)))
test_y = np.vstack((test_y, weight)).T
models['model'] = model.state_dict()
models['ema_model'] = ema_model.state_dict()
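# Cross-validation: stratified folds over the training data, each augmented with a
# disjoint chunk of pseudo-labeled test data. Every fold trains a fresh model, tracks
# an EMA copy, reports the bias AUC of the checkpoint-weighted average and of the EMA
# model, and accumulates out-of-fold and test predictions.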
with timer('train'):
splits = list(
StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(train_x, y_binary))
splits_test = list(KFold(n_splits=n_splits, shuffle=True, random_state=seed).split(test_x))
for fold, ((train_idx, valid_idx), (train_idx_test, _)) in enumerate(zip(splits, splits_test)):
print(f'Fold {fold + 1}')
x_train_fold = np.concatenate((train_x[train_idx], test_x[train_idx_test]), axis=0)
y_train_fold = np.concatenate((train_y[train_idx], test_y[train_idx_test]), axis=0)
x_valid_fold = train_x[valid_idx]
y_valid_fold = train_y[valid_idx]
valid_nan_mask = train_nan_mask[valid_idx]
y_valid_fold_binary = y_binary[valid_idx]
y_valid_fold_identity_binary = y_identity_binary[valid_idx]
evaluator = JigsawEvaluator(y_valid_fold_binary, y_valid_fold_identity_binary)
train_dataset = TextDataset(x_train_fold, targets=y_train_fold, maxlen=max_len)
valid_dataset = TextDataset(x_valid_fold, targets=y_valid_fold, maxlen=max_len)
train_sampler = BucketSampler(train_dataset, train_dataset.get_keys(),
bucket_size=batch_size * 20, batch_size=batch_size)
valid_sampler = BucketSampler(valid_dataset, valid_dataset.get_keys(),
batch_size=batch_size, shuffle_data=False)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False,
sampler=train_sampler, num_workers=0, collate_fn=collate_fn)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False,
sampler=valid_sampler, collate_fn=collate_fn)
model = NeuralNet(embedding_matrix).to(device)
ema_model = copy.deepcopy(model)
ema_model.eval()
ema_n = int(len(train_loader.dataset) / (updates_per_epoch * batch_size))
ema = EMA(model, mu, n=ema_n)
scale_fn = combine_scale_functions(
[partial(scale_cos, 1e-4, 5e-3), partial(scale_cos, 5e-3, 1e-3)], [0.2, 0.8])
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
scheduler = ParamScheduler(optimizer, scale_fn, train_epochs * len(train_loader))
all_valid_preds = []
all_test_preds = []
for epoch in range(train_epochs):
start_time = time.time()
model.train()
for _, x_batch, y_batch in train_loader:
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
scheduler.batch_step()
y_pred = model(x_batch)
loss = nn.BCEWithLogitsLoss(weight=y_batch[:, 1])(y_pred[:, 0], y_batch[:, 0])
optimizer.zero_grad()
loss.backward()
optimizer.step()
ema.on_batch_end(model)
valid_preds = eval_model(model, valid_loader)
valid_preds[valid_nan_mask] = nan_pred
all_valid_preds.append(valid_preds)
auc_score = evaluator.get_final_metric(valid_preds)
elapsed_time = time.time() - start_time
print('Epoch {}/{} \t auc={:.5f} \t time={:.2f}s'.format(
epoch + 1, train_epochs, auc_score, elapsed_time))
            epoch_test_preds = eval_model(model, test_loader)
            all_test_preds.append(epoch_test_preds)
models[f'model_{fold}{epoch}'] = model.state_dict()
ema.on_epoch_end(model)
ema.set_weights(ema_model)
ema_model.lstm.flatten_parameters()
ema_model.gru.flatten_parameters()
models[f'ema_model_{fold}'] = ema_model.state_dict()
checkpoint_weights = np.array([2 ** epoch for epoch in range(train_epochs)])
checkpoint_weights = checkpoint_weights / checkpoint_weights.sum()
valid_preds_fold = np.average(all_valid_preds, weights=checkpoint_weights, axis=0)
valid_preds_fold[valid_nan_mask] = nan_pred
        auc_score = evaluator.get_final_metric(valid_preds_fold)
print(f'cv model \t auc={auc_score:.5f}')
ema_valid_preds_fold = eval_model(ema_model, valid_loader)
ema_valid_preds_fold[valid_nan_mask] = nan_pred
auc_score = evaluator.get_final_metric(ema_valid_preds_fold)
print(f'EMA model \t auc={auc_score:.5f}')
train_preds[valid_idx] = valid_preds_fold
ema_train_preds[valid_idx] = ema_valid_preds_fold
test_preds_fold = np.average(all_test_preds, weights=checkpoint_weights, axis=0)
ema_test_preds_fold = eval_model(ema_model, test_loader)
test_preds += test_preds_fold / n_splits
ema_test_preds += ema_test_preds_fold / n_splits
torch.save(models, 'model.pt')
test_preds[test_nan_mask] = nan_pred
ema_test_preds[test_nan_mask] = nan_pred
evaluator = JigsawEvaluator(y_binary, y_identity_binary)
auc_score = evaluator.get_final_metric(train_preds)
ema_auc_score = evaluator.get_final_metric(ema_train_preds)
print(f'cv score: {auc_score:<8.5f}')
print(f'EMA cv score: {ema_auc_score:<8.5f}')
train_preds = np.mean([train_preds, ema_train_preds], axis=0)
test_preds = np.mean([test_preds, ema_test_preds], axis=0)
auc_score = evaluator.get_final_metric(train_preds)
print(f'final prediction score: {auc_score:<8.5f}')
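# Final submission: blend the cross-validated ensemble (90%) with the pseudo-label
# model's test predictions (10%).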
submission = pd.read_csv(SAMPLE_SUBMISSION, index_col='id')
submission['prediction'] = test_preds * 0.9 + test_y[:, 0] * 0.1
submission.reset_index(drop=False, inplace=True)
submission.to_csv('submission.csv', index=False)
submission.head()