UFLDL教程（六）之栈式自编码器-蒲公英云

第 0 步：初始化一些参数和常数

第**1**步：利用训练样本集训练第一个稀疏编码器

第**2**步：利用第一个稀疏编码器提取到的特征训练第二个稀疏编码器

第**3**步：利用第二个稀疏编码器提取到的特征训练softmax回归模型

第**4**步：利用误差反向传播进行微调

第**5**步：利用测试样本集对得到的分类器进行精度测试

下面将程序实现过程中的关键代码post出，欢迎各位网友指点！

stackedAEExercise.m

% stackedAEExercise: driver script for the stacked-autoencoder exercise.
% Trains two sparse autoencoders layer by layer, trains a softmax classifier
% on the second layer's features, fine-tunes the whole network with minFunc,
% and prints test accuracy before and after fine-tuning.
clc
clear
close all
addpath ../common/
addpath ../common/minFunc
%%======================================================================
%% STEP 0: Set the parameters of the stacked autoencoder
% Input/output structure of the whole network
inputSize = 28 * 28;
numClasses = 10;
% Sparse autoencoder structure
hiddenSizeL1 = 200;    % Layer 1 Hidden Size
hiddenSizeL2 = 200;    % Layer 2 Hidden Size
% Weights of the cost terms
sparsityParam = 0.1;   % desired average activation of the hidden units (rho in the lecture)
beta = 3;              % weight of sparsity penalty term
lambda = 3e-3;         % weight decay parameter
%%======================================================================
%% STEP 1: Load the MNIST images and labels
% Forward slash works as a path separator on every platform and matches the
% 'mnist/...' file names passed to the loaders below.
addpath mnist/
trainData = loadMNISTImages('mnist/train-images-idx3-ubyte');
trainLabels = loadMNISTLabels('mnist/train-labels-idx1-ubyte');
trainLabels(trainLabels == 0) = 10; % Remap 0 to 10 since our labels need to start from 1
%%======================================================================
%% STEP 2: Train the first sparse autoencoder (trainData is used without its labels)
%  Randomly initialize the parameters
sae1Theta = initializeParameters(hiddenSizeL1, inputSize);
%  Optimizer settings; the same options struct is reused by the later minFunc calls
options.Method = 'lbfgs';
options.maxIter = 400;      % Maximum number of iterations of L-BFGS to run
options.display = 'on';
%  Run the optimizer; the learned parameters are returned in sae1OptTheta
[sae1OptTheta, ~] = minFunc( @(p) sparseAutoencoderCost(p, ...
                                   inputSize, hiddenSizeL1, ...    % input size, hidden size
                                   lambda, sparsityParam, ...
                                   beta, trainData), ...
                              sae1Theta, options);
%save('sae1OptTheta.mat','sae1OptTheta')
% % Visualize weights
% W11 = reshape(sae1OptTheta(1:hiddenSizeL1 * inputSize), hiddenSizeL1, inputSize);
% display_network(W11');
% load('sae1OptTheta.mat');
%%======================================================================
%% STEP 3: Train the second sparse autoencoder on the first autoencoder's features
% Compute the first autoencoder's output sae1Features
[sae1Features] = feedForwardAutoencoder(sae1OptTheta, hiddenSizeL1, ...
                                        inputSize, trainData);
%  Randomly initialize the parameters
sae2Theta = initializeParameters(hiddenSizeL2, hiddenSizeL1);
% Train the second autoencoder (input size hiddenSizeL1, hidden size
% hiddenSizeL2); the learned parameters are returned in sae2OptTheta
[sae2OptTheta, ~] = minFunc( @(p) sparseAutoencoderCost(p, ...
                                   hiddenSizeL1, hiddenSizeL2, ... % input size, hidden size
                                   lambda, sparsityParam, ...
                                   beta, sae1Features), ...
                              sae2Theta, options);
% save('sae2OptTheta.mat','sae2OptTheta')
% % Visualize weights
% % W21 = reshape(sae2OptTheta(1:hiddenSizeL2 * hiddenSizeL1), hiddenSizeL2, hiddenSizeL1);
% % display_network(W21'); % cannot be visualized!!
% load('sae2OptTheta.mat');
%%======================================================================
%% STEP 4: Train the softmax classifier on the second autoencoder's features
% Compute the second autoencoder's output sae2Features
[sae2Features] = feedForwardAutoencoder(sae2OptTheta, hiddenSizeL2, ...
                                        hiddenSizeL1, sae1Features);
%  Randomly initialize the parameters
%  NOTE(review): saeSoftmaxTheta is not referenced again in this script;
%  it is kept so the random number stream seen by later calls is unchanged.
saeSoftmaxTheta = 0.005 * randn(hiddenSizeL2 * numClasses, 1);
% Optimize the softmax classifier.
% maxIter is lowered to 100 here and stays at 100 for the fine-tuning call in STEP 5.
options.maxIter = 100;
softmaxModel = softmaxTrain(size(sae2Features,1), numClasses, lambda, ...
                            sae2Features, trainLabels, options);
saeSoftmaxOptTheta=softmaxModel.optTheta(:);
% load('saeSoftmaxOptTheta.mat')
%%======================================================================
%% STEP 5: Fine-tune the stacked autoencoder
% The pre-trained autoencoder parameters (stack) and softmax parameters
% (saeSoftmaxOptTheta) are used as the starting point for fine-tuning.
stack = cell(2,1); % one cell per autoencoder layer, each with fields w and b
% Each layer's w is the first hidden*input entries of its parameter vector;
% b is read right after the first 2*hidden*input entries.
% NOTE(review): presumably the vector layout is [W1; W2; b1; b2] — confirm
% against initializeParameters / sparseAutoencoderCost.
stack{1}.w = reshape(sae1OptTheta(1:hiddenSizeL1*inputSize), ...
                     hiddenSizeL1, inputSize);
stack{1}.b = sae1OptTheta(2*hiddenSizeL1*inputSize+1:2*hiddenSizeL1*inputSize+hiddenSizeL1);
stack{2}.w = reshape(sae2OptTheta(1:hiddenSizeL2*hiddenSizeL1), ...
                     hiddenSizeL2, hiddenSizeL1);
stack{2}.b = sae2OptTheta(2*hiddenSizeL2*hiddenSizeL1+1:2*hiddenSizeL2*hiddenSizeL1+hiddenSizeL2);
% Flatten the stack into a vector and capture the network configuration
[stackparams, netconfig] = stack2params(stack);
% Full model parameters: softmax parameters first, then the stack parameters
stackedAETheta = [ saeSoftmaxOptTheta ; stackparams ];
% Whether to run the gradient check before fine-tuning
DEBUG=1;
if DEBUG
    checkStackedAECost()
end
% Fine-tune (use minFunc to minimize stackedAECost)
[stackedAEOptTheta, cost] = minFunc( @(p) stackedAECost(p, ...
                                       inputSize, hiddenSizeL2,... % input size, last hidden layer size
                                       numClasses, netconfig, ...  % network configuration of the stack
                                       lambda, trainData, trainLabels), ...
                                       stackedAETheta, options);
%%======================================================================
%% STEP 6: Test
% Load the labelled test set
testData = loadMNISTImages('mnist/t10k-images-idx3-ubyte');
testLabels = loadMNISTLabels('mnist/t10k-labels-idx1-ubyte');
testLabels(testLabels == 0) = 10; % Remap 0 to 10
% Predict with the fine-tuned parameters
[pred] = stackedAEPredict(stackedAEOptTheta, inputSize, hiddenSizeL2, ...
                          numClasses, netconfig, testData);
acc = mean(testLabels(:) == pred(:)); % fraction of correctly classified images
fprintf('After Finetuning Test Accuracy: %0.3f%%\n', acc * 100);
% Predict with the parameters from before fine-tuning
[pred] = stackedAEPredict(stackedAETheta, inputSize, hiddenSizeL2, ...
                          numClasses, netconfig, testData);
acc = mean(testLabels(:) == pred(:)); % fraction of correctly classified images
fprintf('Before Finetuning Test Accuracy: %0.3f%%\n', acc * 100);
% Accuracy is the proportion of correctly classified images
% The results for our implementation were:
% Before Finetuning Test Accuracy: 87.7%
% After Finetuning Test Accuracy:  97.6%
%
% If your values are too low (accuracy less than 95%), you should check 
% your code for errors, and make sure you are training on the 
% entire data set of 60000 28x28 training images 
% (unless you modified the loading code, this should be the case)

stackedAEPredict.m

% stackedAEPredict: Takes a trained theta and a data set, and returns the
% predicted label for each example.
%
% theta:      parameter vector; the first hiddenSize*numClasses entries are the
%             softmax weights, the remainder is the flattened stack
% inputSize:  the number of input units (not referenced in the body)
% hiddenSize: the number of hidden units feeding the softmax layer
%             (number of columns of the softmax weight matrix)
% numClasses: the number of categories
% netconfig:  network configuration passed to params2stack
% data:       matrix with one example per column, so data(:,i) is the i-th example
%
% Returns pred, a 1-by-N row vector where pred(i) is argmax_c P(y = c | x(i)),
% with labels starting from 1.
function [pred] = stackedAEPredict(theta, inputSize, hiddenSize, numClasses, netconfig, data)
%% Unroll theta parameter
% Softmax weights come first in theta
softmaxTheta = reshape(theta(1:hiddenSize*numClasses), numClasses, hiddenSize);
% Extract out the "stack"
stack = params2stack(theta(hiddenSize*numClasses+1:end), netconfig);
%% Forward propagation through the stacked layers
% The loop indexes stack{i}, so the bound is taken from the stack itself.
depth = numel(stack);
a = cell(depth+1, 1);
a{1} = data;
for i = 1:depth
    a{i+1} = sigmoid(bsxfun(@plus, stack{i}.w*a{i}, stack{i}.b));
end
%% Softmax output
% The input of the softmax layer is the output of the last stacked layer.
M = softmaxTheta * a{depth+1};
% Subtract each column's maximum before exponentiating to avoid overflow.
% The reduction dimension is given explicitly so a single-row M is still
% reduced per column (per example).
M = bsxfun(@minus, M, max(M, [], 1));
expM = exp(M);
Htheta = bsxfun(@rdivide, expM, sum(expM, 1));
%% The row index of each column's largest probability is the predicted class
[~, pred] = max(Htheta, [], 1);
end
% Element-wise logistic sigmoid, 1 / (1 + e^(-x)); the output has the same
% size as x.
function sigm = sigmoid(x)
    expNeg = exp(-x);
    sigm = 1 ./ (1 + expNeg);
end

stackedAECost.m

% stackedAECost: cost and gradient of a stacked autoencoder with a softmax
% output layer. Used for fine-tuning.
%
% Inputs:
%   theta:      parameter vector of the whole network; the first
%               hiddenSize*numClasses entries are the softmax weights, the
%               remainder is the flattened stack
%   inputSize:  size of the input layer (not referenced in the body)
%   hiddenSize: hidden-layer size of the last autoencoder (number of columns
%               of the softmax weight matrix)
%   numClasses: number of classes
%   netconfig:  the network configuration of the stack (passed to params2stack)
%   lambda:     the weight regularization penalty (applied to the softmax
%               weights only)
%   data:       training set, data(:,i) is the i-th training example
%   labels:     labels(i) is the label (1..numClasses) of the i-th example
% Outputs:
%   cost: value of the cost function
%   grad: gradient vector, ordered as [softmax gradient; stack gradient]
function [ cost, grad ] = stackedAECost(theta, ...
                                              inputSize, hiddenSize, ...% input size, last hidden layer size
                                              numClasses, netconfig, ...% number of classes, stack configuration
                                              lambda, data, labels)
%% Split theta into the softmax weights and the autoencoder stack
softmaxTheta = reshape(theta(1:hiddenSize*numClasses), numClasses, hiddenSize);
stack = params2stack(theta(hiddenSize*numClasses+1:end), netconfig);

%% Setup
numCases = size(data, 2); % number of examples
% One-hot label matrix. The size is given explicitly so it always has
% numClasses rows, even when the highest label does not occur in labels.
groundTruth = full(sparse(labels, 1:numCases, 1, numClasses, numCases));

%% Forward propagation
depth = numel(stack);   % number of stacked hidden layers
a = cell(depth+1, 1);   % activations; a{1} is the input, a{2..depth+1} the layer outputs
a{1} = data;
for i = 1:depth
    a{i+1} = sigmoid(bsxfun(@plus, stack{i}.w*a{i}, stack{i}.b));
end

% Softmax layer: its input is the output of the last stacked layer.
softmaxData = a{depth+1};
M = softmaxTheta * softmaxData;
% Subtract each column's maximum before exponentiating to avoid overflow.
M = bsxfun(@minus, M, max(M, [], 1));
expM = exp(M);
colSum = sum(expM, 1);
Htheta = bsxfun(@rdivide, expM, colSum);
% Log-probabilities computed directly from M rather than as log(Htheta):
% an entry of Htheta that underflows to 0 would give log(0) = -Inf and
% 0 * (-Inf) = NaN in the cost below.
logHtheta = bsxfun(@minus, M, log(colSum));

%% Cost: mean cross-entropy plus weight decay on the softmax weights
cost = -sum(sum(groundTruth.*logHtheta))/numCases + lambda*sum(softmaxTheta(:).^2)/2;

%% Gradients
% Output error (predicted probabilities minus one-hot labels)
outErr = Htheta - groundTruth;
% Softmax layer gradient
softmaxThetaGrad = outErr*softmaxData'/numCases + lambda*softmaxTheta;
% Back-propagate the error terms through the stacked layers;
% a.*(1-a) is the derivative of the sigmoid.
delta = cell(depth+1, 1);
delta{depth+1} = (softmaxTheta'*outErr).*a{depth+1}.*(1-a{depth+1});
for i = depth:-1:2
    delta{i} = (stack{i}.w'*delta{i+1}).*a{i}.*(1-a{i});
end
% Gradients of the stacked layers; sum(...,2) yields a column vector for b.
stackgrad = cell(size(stack));
for i = 1:depth
    stackgrad{i}.w = delta{i+1}*a{i}'/numCases;
    stackgrad{i}.b = sum(delta{i+1}, 2)/numCases;
end

%% Roll gradient vector
grad = [softmaxThetaGrad(:) ; stack2params(stackgrad)];
end
% Logistic sigmoid applied element-wise to x: 1 / (1 + e^(-x)).
function sigm = sigmoid(x)
    denom = 1 + exp(-x);
    sigm = 1 ./ denom;
end