Building an End-to-End Speech Recognition Model in PyTorch with AssemblyAI (Part 3)


Recurrent neural networks (RNNs) are well suited to sequence modeling problems. An RNN steps through the audio features one frame at a time, making a prediction for each frame while using the context of the frames that came before it. We use a bidirectional RNN (BiRNN) because we want not only the context of the frames before each step, but also the context of the frames after it.
This helps the model make better predictions, because every frame of audio has more information available before a prediction is made. We use the gated recurrent unit (GRU) variant of the RNN because it requires less compute than an LSTM and, in some cases, works just as well.
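One practical consequence of the bidirectional choice is that the GRU's output feature size doubles, which is why the classifier in the model below expects rnn_dim*2 inputs. The following is a minimal sketch of that behavior; the layer sizes (batch 4, 100 time steps, 128 features) are illustrative assumptions, not the tutorial's hyperparameters.

import torch
import torch.nn as nn

# A bidirectional GRU concatenates the forward and backward hidden states,
# so its output feature dimension is 2 * hidden_size.
gru = nn.GRU(input_size=128, hidden_size=128, num_layers=1,
             batch_first=True, bidirectional=True)
x = torch.randn(4, 100, 128)   # (batch, time, feature) -- made-up sizes
out, _ = gru(x)
print(out.shape)               # torch.Size([4, 100, 256]), i.e. 2 * hidden_size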
The model outputs a probability matrix over characters; we feed that matrix into a decoder, which extracts the characters the model considers most likely (a minimal greedy decoder is sketched after the model code below).
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNNLayerNorm(nn.Module):
    """Layer normalization built for CNN inputs"""
    def __init__(self, n_feats):
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous()  # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()  # (batch, channel, feature, time)


class ResidualCNN(nn.Module):
    """Residual CNN inspired by https://arxiv.org/pdf/1603.05027.pdf
    except with layer norm instead of batch norm
    """
    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride, padding=kernel//2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride, padding=kernel//2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x  # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):
    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(
            input_size=rnn_dim, hidden_size=hidden_size,
            num_layers=1, batch_first=batch_first, bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class SpeechRecognitionModel(nn.Module):
    """Speech Recognition Model inspired by DeepSpeech 2"""
    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats, stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        n_feats = n_feats//2
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3//2)  # cnn for extracting hierarchical features

        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout, n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats*32, rnn_dim)
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim*2,
                             hidden_size=rnn_dim, dropout=dropout, batch_first=i == 0)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim*2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
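To make the connection to the decoder concrete, here is a minimal sketch of instantiating the model, running a forward pass, and reducing the character-probability matrix with a simple greedy (argmax) decode. The hyperparameters, the blank index, and the greedy_decode helper are illustrative assumptions and not the tutorial's actual decoder, which may differ.

import torch
import torch.nn.functional as F

# Illustrative hyperparameters (assumptions, not necessarily the tutorial's values).
model = SpeechRecognitionModel(
    n_cnn_layers=3, n_rnn_layers=5, rnn_dim=512,
    n_class=29, n_feats=128, stride=2, dropout=0.1)

# A dummy batch of spectrograms: (batch, channel, feature, time).
spectrograms = torch.randn(1, 1, 128, 200)
output = model(spectrograms)                    # (batch, time, n_class)
log_probs = F.log_softmax(output, dim=2)

# Hypothetical greedy decode: take the most likely character at every time
# step, then collapse repeats and drop the blank token (index 28 here, by assumption).
def greedy_decode(log_probs, blank=28):
    best = torch.argmax(log_probs, dim=2)       # (batch, time)
    decoded = []
    for seq in best:
        chars, prev = [], None
        for idx in seq.tolist():
            if idx != prev and idx != blank:
                chars.append(idx)
            prev = idx
        decoded.append(chars)
    return decoded  # lists of character indices; map to text with a character map

print(greedy_decode(log_probs))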

