import torch


class MaxState(torch.nn.Module):
    def __init__(self, hidden_dim, heads, win):
        super(MaxState, self).__init__()

        assert hidden_dim % heads == 0, "Hidden size must be divisible by the number of heads."

        self.head_size = hidden_dim // heads
        # Three parallel projections of the input; their outputs are combined in forward().
        self.head0 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head1 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.head2 = torch.nn.Linear(hidden_dim, hidden_dim, bias=False)

        self.head_num = heads
        self.hidden = hidden_dim
        # `win` is accepted for interface compatibility but is not used.

    def forward(self, input_data, state=None):
        b, s, k, h = input_data.shape[0], input_data.shape[1], self.head_num, self.head_size

        out = self.head0(input_data)
        out1 = self.head1(input_data)
        out2 = self.head2(input_data)

        # Split the first two projections into heads: [b, s, k*h] -> [b, k, s, h].
        out = out.reshape([b, s, k, h]).permute([0, 2, 1, 3])
        out1 = out1.reshape([b, s, k, h]).permute([0, 2, 1, 3])

        # Causal running maximum over the sequence dimension, scaled by sqrt(head_size).
        out = torch.cummax((out + out1) / h ** 0.5, dim=2)[0]

        # Merge the heads back: [b, k, s, h] -> [b, s, k*h].
        out = out.permute([0, 2, 1, 3]).reshape([b, s, -1])
        out1 = out1.permute([0, 2, 1, 3]).reshape([b, s, -1])

        # Gated combination of the running maximum with the other two projections.
        out = (out + out2) * out + out1

        return out, state




class FeedForward(torch.nn.Module):
    def __init__(self, hidden_size):
        super(FeedForward, self).__init__()

        # Gated feed-forward block: ffn1 is modulated by a ReLU-gated branch, then projected back down.
        self.ffn1 = torch.nn.Linear(hidden_size, hidden_size * 2)
        self.ffn2 = torch.nn.Linear(hidden_size * 2, hidden_size)
        self.gate = torch.nn.Linear(hidden_size, hidden_size * 2)
        self.relu = torch.nn.ReLU()

    def forward(self, x):
        x1 = self.ffn1(x)
        x2 = self.relu(self.gate(x))
        x = self.ffn2(x1 * x2)
        return x


class DecoderLayer(torch.nn.Module):
    def __init__(self, hidden_size, num_heads):
        super(DecoderLayer, self).__init__()
        self.self_attention = MaxState(hidden_size, num_heads, 8)
        self.ffn = FeedForward(hidden_size)
        self.layer_norm = torch.nn.LayerNorm(hidden_size)
        # Learnable gate that balances the FFN branch against the residual input.
        self.alpha = torch.nn.Parameter(torch.tensor(0.5))

    def forward(self, x, state=None, seq_len=None):
        # `seq_len` is accepted for interface compatibility but is not used.
        x1, state = self.self_attention(x, state)
        x = self.layer_norm(self.alpha * self.ffn(x1) + (1 - self.alpha) * x)

        return x, state


class SamOut(torch.nn.Module):
    def __init__(self, voc_size, hidden_size, num_heads, num_layers):
        super(SamOut, self).__init__()
        self.em = torch.nn.Embedding(voc_size, hidden_size, padding_idx=0)
        self.pos = torch.nn.Embedding(1024, hidden_size)

        self.decoder_layers = torch.nn.ModuleList([DecoderLayer(hidden_size, num_heads) for _ in range(num_layers)])
        self.head = torch.nn.Linear(hidden_size, voc_size, bias=False)

        # Per-layer projections that fuse the positional signal back down to hidden_size.
        self.down = torch.nn.ModuleList(
            [torch.nn.Linear(2 * hidden_size, hidden_size, bias=False) for _ in range(num_layers)])


    def state_forward(self, state, pos, x):
        if state is None:
            state = [None] * len(self.decoder_layers)
        for i, decoder_layer in enumerate(self.decoder_layers):
            # Broadcast the positional embedding across the batch, concatenate it with the
            # hidden states, and project back to hidden_size before each layer.
            x = self.down[i](torch.concat([torch.zeros([x.shape[0], 1, 1], device=x.device) + pos, x], -1))
            x1, state[i] = decoder_layer(x, state[i])
            x = x1 + x  # residual connection around the decoder layer
        return x, state
  

    def pos_forward(self, x):
        # The positional table has 1024 entries; for longer sequences the position index is split
        # into a coarse part (index // 1024) and a fine part (index % 1024) whose embeddings are summed.
        if x.shape[1] >= 1024:
            pos = self.pos(torch.arange(0, x.shape[1]).long().to(x.device) // 1024).unsqueeze(0)
            pos = self.pos(torch.arange(0, x.shape[1]).long().to(x.device) % 1024).unsqueeze(0) + pos
        else:
            pos = self.pos(torch.arange(0, x.shape[1]).long().to(x.device)).unsqueeze(0)
        return pos

    def forward(self, x0):
        logits, state = self.one_forward(x0, state=None)

        return logits, state

    def one_forward(self, x, state=None, seq_len=None):
        x = self.em(x)  # token embeddings: [batch, seq_len, hidden_size]

        pos = self.pos_forward(x)

        x, state = self.state_forward(state, pos, x)

        return self.head(x), state  # logits over the vocabulary


device = "cuda"
if __name__ == '__main__':
    net = SamOut(235, 256, 16, 4)
    net.to(device)
    net(torch.randint(0, 200, [2, 8 * 13]).to(device))
    #

This code defines a PyTorch neural network model for processing sequence data. Its main components are:

  1. The MaxState class: a custom layer with three linear projections (head0, head1, head2) whose outputs are combined through a cumulative-maximum operation that acts as the state update (see the shape sketch after this summary).
  2. The FeedForward class: a gated feed-forward network built from linear layers and a ReLU activation that applies a non-linear transformation to its input.
  3. The DecoderLayer class: a decoder block that combines the MaxState module with the feed-forward network and normalizes the result with layer normalization.
  4. The SamOut class: the full model, consisting of a token embedding, positional embeddings, a stack of decoder layers, and an output head; it also defines the stateful forward pass used to process sequences.
  5. Device configuration: at the end of the file the model is moved to the CUDA device for GPU acceleration (with a CPU fallback).
  6. Model test: the main block builds a SamOut instance and runs a single forward pass on a random integer tensor to check that the model executes correctly.
    Overall, the model appears to target sequence-to-sequence style tasks such as machine translation or text generation, using the MaxState mechanism in place of multi-head attention together with feed-forward blocks to capture dependencies in sequence data; a minimal greedy-decoding sketch follows the next example.
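
To make item 1 concrete, here is a minimal, standalone sketch (illustrative only, not taken from the original code) of the cumulative-maximum step at the core of MaxState. The tensor layout [batch, heads, seq, head_size] and the 1/sqrt(head_size) scaling mirror MaxState.forward; the concrete sizes are arbitrary example values.

import torch

# Shapes mirror MaxState.forward after the reshape/permute: [batch, heads, seq, head_size].
b, k, s, h = 2, 4, 5, 8
x = torch.randn(b, k, s, h)

# Causal running maximum along the sequence dimension, scaled as in MaxState.
running_max, _ = torch.cummax(x / h ** 0.5, dim=2)

# Position t only depends on positions <= t, so the operation is causal by construction.
assert torch.equal(running_max[:, :, 0], x[:, :, 0] / h ** 0.5)
assert torch.all(running_max[:, :, -1] >= running_max[:, :, 0])
print(running_max.shape)  # torch.Size([2, 4, 5, 8])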
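
As a usage illustration of the full model, the sketch below runs a simple greedy generation loop on top of SamOut. It relies only on the classes and the `device` variable defined above; the prompt, the number of generated tokens, and the random seed are arbitrary choices for the demo, not values from the original.

import torch

# Illustrative greedy decoding loop (assumes SamOut and `device` from the code above).
torch.manual_seed(0)
model = SamOut(voc_size=235, hidden_size=256, num_heads=16, num_layers=4).to(device)
model.eval()

tokens = torch.randint(1, 200, [1, 4]).to(device)  # arbitrary prompt; id 0 is the padding index

with torch.no_grad():
    for _ in range(16):                             # generate 16 more tokens
        logits, _ = model(tokens)                   # logits: [batch, seq_len, voc_size]
        next_token = logits[:, -1].argmax(-1, keepdim=True)
        tokens = torch.cat([tokens, next_token], dim=1)

print(tokens.shape)  # torch.Size([1, 20])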