利用 AssemblyAI 在 PyTorch 中建立端到端的语音识别模型( 三 )
本文插图
递归神经网络(RNN)擅长处理序列建模问题 。 RNN会逐步处理音频特征 , 在使用前一帧的上下文的同时对每一帧进行预测 。 我们使用BiRNN是因为我们不仅需要每一帧之前的上下文 , 还希望得到它之后的上下文 。
这可以帮助模型做出更好的预测 , 因为音频中的每一帧在进行预测之前都会有更多信息 。 我们使用RNN的门控递归单元(GRU)变种 , 因为它比LSTM需要的计算资源更少 , 并且在某些情况下工作效果也一样 。
该模型输出一个字符概率矩阵 , 我们会将该矩阵输入解码器 , 提取出模型认为概率最高的字符 。
class CNNLayerNorm(nn.Module):
    """Layer normalization built for CNN input of shape (batch, channel, feature, time).

    nn.LayerNorm normalizes the last dimension, so the tensor is transposed to
    put the feature axis last, normalized, then transposed back.
    """

    def __init__(self, n_feats):
        # fix: original called `super(...).__init__` without parentheses, so
        # nn.Module.__init__ never ran and submodule assignment raised.
        super(CNNLayerNorm, self).__init__()
        self.layer_norm = nn.LayerNorm(n_feats)

    def forward(self, x):
        # x: (batch, channel, feature, time)
        x = x.transpose(2, 3).contiguous()  # (batch, channel, time, feature)
        x = self.layer_norm(x)
        return x.transpose(2, 3).contiguous()  # (batch, channel, feature, time)


class ResidualCNN(nn.Module):
    """Residual CNN block inspired by https://arxiv.org/pdf/1603.05027.pdf
    except with layer norm instead of batch norm.

    Pre-activation ordering: norm -> GELU -> dropout -> conv, twice, with a
    skip connection around the whole block. Requires in_channels == out_channels
    and stride 1 for the residual add to be shape-compatible.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, dropout, n_feats):
        super(ResidualCNN, self).__init__()
        self.cnn1 = nn.Conv2d(in_channels, out_channels, kernel, stride,
                              padding=kernel // 2)
        self.cnn2 = nn.Conv2d(out_channels, out_channels, kernel, stride,
                              padding=kernel // 2)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.layer_norm1 = CNNLayerNorm(n_feats)
        self.layer_norm2 = CNNLayerNorm(n_feats)

    def forward(self, x):
        residual = x  # (batch, channel, feature, time)
        x = self.layer_norm1(x)
        x = F.gelu(x)
        x = self.dropout1(x)
        x = self.cnn1(x)
        x = self.layer_norm2(x)
        x = F.gelu(x)
        x = self.dropout2(x)
        x = self.cnn2(x)
        x += residual
        return x  # (batch, channel, feature, time)


class BidirectionalGRU(nn.Module):
    """Pre-norm bidirectional GRU layer: LayerNorm -> GELU -> BiGRU -> Dropout.

    Output feature size is 2 * hidden_size (forward + backward directions).
    """

    def __init__(self, rnn_dim, hidden_size, dropout, batch_first):
        super(BidirectionalGRU, self).__init__()
        self.BiGRU = nn.GRU(input_size=rnn_dim, hidden_size=hidden_size,
                            num_layers=1, batch_first=batch_first,
                            bidirectional=True)
        self.layer_norm = nn.LayerNorm(rnn_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.layer_norm(x)
        x = F.gelu(x)
        x, _ = self.BiGRU(x)
        x = self.dropout(x)
        return x


class SpeechRecognitionModel(nn.Module):
    """Speech recognition model inspired by DeepSpeech 2.

    Pipeline: stride-2 CNN front end -> n residual CNN blocks -> linear
    projection -> stack of bidirectional GRUs -> per-timestep character
    classifier. Input is (batch, 1, n_feats, time); output is
    (batch, time', n_class) character logits for a CTC-style decoder.
    """

    def __init__(self, n_cnn_layers, n_rnn_layers, rnn_dim, n_class, n_feats,
                 stride=2, dropout=0.1):
        super(SpeechRecognitionModel, self).__init__()
        # The stride-2 front-end conv halves the feature axis.
        n_feats = n_feats // 2
        # cnn for extracting hierarchical features
        self.cnn = nn.Conv2d(1, 32, 3, stride=stride, padding=3 // 2)
        # n residual cnn layers with filter size of 32
        self.rescnn_layers = nn.Sequential(*[
            ResidualCNN(32, 32, kernel=3, stride=1, dropout=dropout,
                        n_feats=n_feats)
            for _ in range(n_cnn_layers)
        ])
        self.fully_connected = nn.Linear(n_feats * 32, rnn_dim)
        # fix: all GRU layers use batch_first=True. The original passed
        # batch_first=(i == 0); since layer 0 emits (batch, time, feat),
        # later layers then recurred over the batch axis instead of time.
        self.birnn_layers = nn.Sequential(*[
            BidirectionalGRU(rnn_dim=rnn_dim if i == 0 else rnn_dim * 2,
                             hidden_size=rnn_dim, dropout=dropout,
                             batch_first=True)
            for i in range(n_rnn_layers)
        ])
        self.classifier = nn.Sequential(
            nn.Linear(rnn_dim * 2, rnn_dim),  # birnn returns rnn_dim*2
            nn.GELU(),  # fix: original passed the class nn.GELU, not an instance
            nn.Dropout(dropout),
            nn.Linear(rnn_dim, n_class)
        )

    def forward(self, x):
        # x: (batch, 1, n_feats, time)
        x = self.cnn(x)
        x = self.rescnn_layers(x)
        sizes = x.size()  # fix: original indexed the bound method `x.size`
        x = x.view(sizes[0], sizes[1] * sizes[2], sizes[3])  # (batch, feature, time)
        x = x.transpose(1, 2)  # (batch, time, feature)
        x = self.fully_connected(x)
        x = self.birnn_layers(x)
        x = self.classifier(x)
        return x
推荐阅读
- 爱我就别想太多:爱我就别想太多:最阴险的女人曝光,比薛瑛更可恨,夏可可被利用
- 樱寺静语日本的佳能电子卫星,利用美国火箭发射失利
- 亦君|一男一女哈佛两学霸作息大公开,相似处太多,男生每天睡觉9小时起床充分利用碎片化时间都用番茄工作法都严格控制了手机的使用都很重视健身,
- 小强幽默|公司如何做好网络营销推广找客户,园林花卉行业企业怎么利用网络推广引流获客
- 风雨同进步|公司如何做好网络营销推广找客户,旅游行业企业怎么利用网络推广引流获客
- 小强幽默酒店行业企业怎么利用网络推广引流获客,公司如何做好网络营销推广找客户
- 守望者加速器de小编|GTA5:利用BUG赚钱的方法你会吗 摩托帮工厂省钱方法介绍
- 王者荣耀|深度攻略|利用自身爆发让敌方快速减员,将敌人逐一击破!
- 全新等离子体光子芯片:利用光进行超高速数据传输
- 兰州新闻网|省二院利用FNS微创治疗股骨颈骨折
