\[J(\Theta) = - \frac{1}{m} \left[ \sum\limits_{i=1}^m \sum\limits_{k=1}^K y_k^{(i)}~log((h_\Theta(x^{(i)}))_k) + (1 -y_k^{(i)})~log(1 - (h_\Theta(x^{(i)}))_k) \right] + \frac{\lambda}{2m} \sum\limits_{l=1}^{L-1} \sum\limits_{i=1}^{s_l} \sum\limits_{j=1}^{s_{l+1}} (\Theta_{ji}^{(l)})^2 \]
\[\delta^{(3)} = (\Theta^{(3)})^T\delta^{(4)}.*g'(z^{(3)}) \text{ where } g'(z^{(3)}) = a^{(3)} .* (1-a^{(3)})\] \[\delta^{(2)} = (\Theta^{(2)})^T\delta^{(3)}.*g'(z^{(2)}) \text{ where } g'(z^{(2)}) = a^{(2)} .* (1-a^{(2)})\]
\[\frac{\partial}{\partial\Theta_{ij}^{(l)}} J(\Theta) = a_j^{(l)}\delta_i^{(l+1)}\]
⇒ Finally, it can be prooved that \(\frac{\partial}{\partial\Theta_{ij}^{(l)}} J(\Theta) = D_{ij}^{(l)}\).
function [jVal, gradient] = costFunction(theta) ... optTheta = fminunc(@costFunction,initialTheta, options);
thetaVec = [ Theta1(:); Theta2(:); Theta3(:) ]; DVec = [ D1(:); D2(:); D3(:)];
Theta1 = reshape(thetaVec(1:110),10,11); Theta2 = reshape(thetaVec(111:220),10,11); Theta3 = reshape(thetaVec(221:231),1,11);
gradApprox = (J(theta + EPSILON) - J(theta - EPSILON))/(2*EPSILON)
\[\frac{\partial}{\partial \theta_i} J(\theta) \approx \frac{J(\theta_1,\dots,\theta_i+\epsilon,\dots,\theta_n) - J(\theta_1,\cdots,\theta_i-\epsilon,\cdots,\theta_n)}{2\epsilon}\]
for i=1:n, thetaPlus = theta; thetaPlus(i) = thetaPlus(i) + EPSILON; thetaMinus = theta; thetaMinus(i) = thetaMinus(i) - EPSILON; gradApprox(i) = (J(thetaPlus) - J(thetaMinus))/(2*EPSILON); end;
Theta1 = rand(10,11)*(2*INIT_EPSILON) - INIT_EPSILON; Theta2 = rand(1,11)*(2*INIT_EPSILON) - INIT_EPSILON;