NN from scratch
Below is some simple code that is easy to alter to get a better understanding of neural network training. You can play with the learning rate, the number of iterations, and the layer sizes, and easily extend the basic programs to more than 2 hidden layers.
#install.packages("torch")
#install.packages("text2vec")
#install.packages("sigmoid")
library(torch)
library(text2vec)
library(sigmoid)
# torch provides autograd, i.e. automatic derivatives of loss functions with respect to the parameters;
# sigmoid provides the plain-R relu()/sigmoid() used in the prediction functions below
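# A minimal autograd sketch (illustrative only, not part of the original programs):
# torch records operations on tensors created with requires_grad = TRUE, and
# backward() fills in $grad - exactly what the training loops below rely on.
xx <- torch_tensor(3, requires_grad = TRUE)
ff <- xx$pow(2)$sum()   # f(x) = x^2
ff$backward()           # autograd computes df/dx
xx$grad                 # 2*x = 6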
# 1 hidden layer NN
myNN1CE <- function(x,y,tr,d1_hidden,d_out,M,learning_rate) {
#
d_in <- dim(x)[2] # input dimension
#
# split into training and validation sets
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
#
# dimensionality of hidden layer
# d1_hidden
# output dimensionality (number of predicted features)
# d_out
#
# weights connecting input to hidden layer
w1 <- torch_randn(d_in, d1_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1_hidden, d_out, requires_grad = TRUE)
# hidden layer bias (intercept in the linear predictor)
b1 <- torch_zeros(1, d1_hidden, requires_grad = TRUE)
# output layer bias (intercept in the linear predictor)
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
# M = number of epochs
lvec <- rep(0,M)
tvec <- rep(0,M)
#
for (t in 1:M) {
### -------- Forward pass --------
y_tr <- xtr$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
y_te <- xte$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_tr, ytr)$mean()
### trace losses
lvec[t] <- loss$item() # training loss
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[t] <- tloss$item() # test loss
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1 <- w1$sub_(learning_rate * w1$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1 <- b1$sub_(learning_rate * b1$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero the gradients after every pass; otherwise they would
# accumulate across iterations and corrupt the next update
w1$grad$zero_()
w2$grad$zero_()
b1$grad$zero_()
b2$grad$zero_()
})
}
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1=w1,w2=w2,b1=b1,b2=b2)) }
### 2 hidden layer network
myNN2CE <- function(x,y,tr,d1a_hidden,d1b_hidden,d_out,M,learning_rate) {
# input layer dimension
d_in <- dim(x)[2]
#
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
#
# dimensionality of hidden layers
# d1a_hidden
# d1b_hidden
# output dimensionality (number of predicted features)
# d_out <- 1
# weights connecting input to hidden layer
w1a <- torch_randn(d_in, d1a_hidden, requires_grad = TRUE)
w1b <- torch_randn(d1a_hidden, d1b_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1b_hidden, d_out, requires_grad = TRUE)
# hidden layer bias (intercept)
b1a <- torch_zeros(1, d1a_hidden, requires_grad = TRUE)
b1b <- torch_zeros(1, d1b_hidden, requires_grad = TRUE)
# output layer bias (intercept)
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
lvec <- rep(0,M)
tvec <- rep(0,M)
for (t in 1:M) {
### -------- Forward pass --------
y_tr <- xtr$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid() # play with different activation functions here
y_te <- xte$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_tr, ytr)$mean()
#loss <- (y_tr - ytr)$pow(2)$mean() #MSE loss if you want to try this for regression
lvec[t] <- loss$item()
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[t] <- tloss$item()
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors with
# requires_grad = TRUE
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1a <- w1a$sub_(learning_rate * w1a$grad)
w1b <- w1b$sub_(learning_rate * w1b$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1a <- b1a$sub_(learning_rate * b1a$grad)
b1b <- b1b$sub_(learning_rate * b1b$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero gradients after every pass, as they'd
# accumulate otherwise
w1a$grad$zero_()
w1b$grad$zero_()
w2$grad$zero_()
b1a$grad$zero_()
b1b$grad$zero_()
b2$grad$zero_()
})
}
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1a=w1a,w1b=w1b,w2=w2,b1a=b1a,b1b=b1b,b2=b2)) }
## Simulated data
TwoCircleData <- function(N1,N2,r1,r2,noise) {
# Circle 1
theta1 <- runif(N1, 0, 2 * pi)
x1 <- r1 * cos(theta1) + rnorm(N1,sd=noise)
x2 <- r1 * sin(theta1) + rnorm(N1,sd=noise)
x <- cbind(x1,x2)
y <- rep(0,length(x1))
# Circle 2
theta2 <- runif(N2, 0, 2 * pi)
x1 <- r2 * cos(theta2) + rnorm(N2,sd=noise)
x2 <- r2 * sin(theta2) + rnorm(N2,sd=noise)
x <- rbind(x,cbind(x1,x2))
y <- c(y, rep(1,length(x1)))
return(list(x=x,y=y))
}
### Predicting on new data
NN1predictions <- function(xnew,NN) {
library(sigmoid)
w2 <- as.matrix(NN$w2)
b2 <- as.matrix(NN$b2)
w1 <- as.matrix(NN$w1)
b1 <- as.matrix(NN$b1)
#
z1 <- as.matrix(xnew)%*%w1
z1 <- relu(z1 + rep(b1,each=nrow(z1)))
z2 <- z1%*%w2
z2 <- sigmoid(z2 + rep(b2,each=nrow(z2)))
ynew <- z2
return(ynew)
}
NN2predictions <- function(xnew,NN) {
library(sigmoid)
w2 <- as.matrix(NN$w2)
b2 <- as.matrix(NN$b2)
w1a <- as.matrix(NN$w1a)
b1a <- as.matrix(NN$b1a)
w1b <- as.matrix(NN$w1b)
b1b <- as.matrix(NN$b1b)
#
z1 <- as.matrix(xnew)%*%w1a
z1 <- relu(z1 + rep(b1a,each=nrow(z1)))
z1 <- as.matrix(z1)%*%w1b
z1 <- relu(z1 + rep(b1b,each=nrow(z1)))
z2 <- z1%*%w2
z2 <- sigmoid(z2 + rep(b2,each=nrow(z2)))
ynew <- z2
return(ynew)
}
#library(torch)
MM <- TwoCircleData(2500,2500,2,3.25,.5)
plot(MM$x,col=MM$y+1)
x <- torch_tensor(MM$x)
y <- torch_tensor(MM$y)
xnew <- (expand.grid(x1 = seq(-5, 5, by = 0.1),
x2 = seq(-5, 5, by = 0.1)))
tr <- sample(seq(1,5000),500) # training set of size 500; also try 1000, 200, 100
mm<-myNN1CE(x,y,tr,10,1,2000,1e-2)
mm2<-myNN2CE(x,y,tr,10,10,1,2000,1e-2)
plot(mm$TR,type="l",ylim=c(0,1))
lines(mm$TE,lty=2)
lines(mm2$TR,col=2)
lines(mm2$TE,col=2,lty=2)
# dashed are validation, solid training
# red is 2 layer network
Here, the two-layer network was clearly better. Try smaller training samples as well: for small samples the two-layer network will start to perform worse than the single-layer one. Explore at home.
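A minimal sketch of that experiment (tr_small, mm_s, and mm2_s are new names introduced here; everything else reuses the functions and data defined above):
tr_small <- sample(seq(1,5000),100)   # a much smaller training set
mm_s  <- myNN1CE(x,y,tr_small,10,1,2000,1e-2)
mm2_s <- myNN2CE(x,y,tr_small,10,10,1,2000,1e-2)
plot(mm_s$TR,type="l",ylim=c(0,1))    # solid = training, dashed = validation
lines(mm_s$TE,lty=2)
lines(mm2_s$TR,col=2)                 # red = 2-layer network
lines(mm2_s$TE,col=2,lty=2)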
NNp <- NN1predictions(xnew,mm)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
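The dplyr/ggplot2 block above is repeated verbatim for every fitted model below. If you prefer, a small helper along these lines avoids the copy-and-paste; plotBoundary() is a new, hypothetical name and its body simply wraps the existing plotting code:
plotBoundary <- function(xnew,NNp,x,y,tr) {
library(dplyr)
library(ggplot2)
probN <- round(NNp)   # 0/1 class predictions on the grid
df1 <- mutate(xnew, prob = probN, class = 0, prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1, prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y=x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
}
# e.g. plotBoundary(xnew,NN1predictions(xnew,mm),x,y,tr)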
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
Explore how the decision boundaries of the single and two-layer networks vary as a function of training size.
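A minimal sketch of that exploration, assuming a crude comparison of the end-of-training validation loss is enough (n_vec, trn, fit1, and fit2 are new names introduced here):
n_vec <- c(100,200,500,1000)
for (n in n_vec) {
trn  <- sample(seq(1,5000),n)
fit1 <- myNN1CE(x,y,trn,10,1,2000,1e-2)
fit2 <- myNN2CE(x,y,trn,10,10,1,2000,1e-2)
cat("n =",n," 1-layer val loss:",round(tail(fit1$TE,1),3),
" 2-layer val loss:",round(tail(fit2$TE,1),3),"\n")
# you could also call plotBoundary(xnew,NN1predictions(xnew,fit1),x,y,trn) here
}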
Let's add stochasticity to the training, i.e. run the training on random batches of data.
myNN1CEbatch <- function(x,y,tr,d1_hidden,d_out,M,batch,learning_rate) {
# batch is another tuning parameter - the size of the batch to train on.
#
d_in <- dim(x)[2]
#
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
# batches for epochs
if (batch!=0) {
batchnumbers <- floor(length(tr)/batch)
B <- batchnumbers
bl <- floor(length(tr)/B)
bins <- rep(seq(1,B),bl)
if (length(bins)<length(tr)) {
bins<-c(bins,seq(1,length(tr)-length(bins)))}
bins<-sample(bins,length(bins)) }# scramble order of observations in batches
#
if (batch==0) { # full batch mode
B<-1
bins<-rep(1,length(tr)) }
#####
# dimensionality of hidden layer
#d1_hidden <- 10
#
# output dimensionality (number of predicted features)
#d_out <- 1
# weights connecting input to hidden layer
w1 <- torch_randn(d_in, d1_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1_hidden, d_out, requires_grad = TRUE)
# hidden layer bias (intercept)
b1 <- torch_zeros(1, d1_hidden, requires_grad = TRUE)
# output layer bias
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
#M<-5000
lvec <- rep(0,M*B) # one entry per batch update: M epochs * B batches per epoch
tvec <- rep(0,M*B)
zz<-0
for (t in 1:M) {
for (bb in (1:B)) {
zz <- zz+1
# get data for current batch
xtrb <- xtr[bins==bb,]
ytrb <- ytr[bins==bb]
### -------- Forward pass --------
y_trb <- xtrb$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid() # update predictions on batch
y_tr <- xtr$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid() # update the predictions on full training data
y_te <- xte$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_trb, ytrb)$mean() # only use the batch for loss computation and gradient updates
#loss <- (y_tr - ytr)$pow(2)$mean()
lvec[zz] <- loss$item()
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[zz] <- tloss$item()
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors with
# requires_grad = TRUE
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1 <- w1$sub_(learning_rate * w1$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1 <- b1$sub_(learning_rate * b1$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero gradients after every pass, as they'd
# accumulate otherwise
w1$grad$zero_()
w2$grad$zero_()
b1$grad$zero_()
b2$grad$zero_()
})
}}
#y_new <- xnew$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1=w1,w2=w2,b1=b1,b2=b2)) }
mmb<-myNN1CEbatch(x,y,tr,10,1,2000,100,1e-2) # batch size 100 - explore other sizes
batchs<-100
trs<-length(tr)
plot(mmb$TR[seq(1, length(mmb$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm$TR,col=1)
lines(mm$TE,lty=2,col=1)
NNp <- NN1predictions(xnew,mmb)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN1predictions(xnew,mm)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
Notice how SGD, i.e. mini-batch training, resulted in a better fit and smoother decision boundaries. Try different batch sizes, learning rates, etc.
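A sketch of that comparison, overlaying the per-epoch validation loss of the 1-layer network for a few batch sizes (bsizes, fit, and step are new names; the per-epoch subsampling mirrors the seq(...,by=floor(trs/batchs)) trick used above):
bsizes <- c(25,50,100)
plot(mm$TE,type="l",ylim=c(0,1),col=1)   # full-batch 1-layer fit for reference (black)
for (i in seq_along(bsizes)) {
fit <- myNN1CEbatch(x,y,tr,10,1,2000,bsizes[i],1e-2)
step <- floor(length(tr)/bsizes[i])      # number of batches per epoch
lines(fit$TE[seq(1,length(fit$TE),by=step)],col=i+1)
}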
We can do the same for the two-layer network.
myNN2CEbatch <- function(x,y,tr,d1a_hidden,d1b_hidden,d_out,M,batch, learning_rate) {
d_in <- dim(x)[2]
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
#
if (batch!=0) {
batchnumbers <- floor(length(tr)/batch)
B <- batchnumbers
bl <- floor(length(tr)/B)
bins <- rep(seq(1,B),bl)
if (length(bins)<length(tr)) {
bins<-c(bins,seq(1,length(tr)-length(bins)))}
bins<-sample(bins,length(bins)) }# scramble order of observations in batches
#
if (batch==0) { # full batch mode
B<-1
bins<-rep(1,length(tr)) }
#d1a_hidden <- 10
#d1b_hidden <- 40
# output dimensionality (number of predicted features)
#d_out <- 1
# weights connecting input to hidden layer
w1a <- torch_randn(d_in, d1a_hidden, requires_grad = TRUE)
w1b <- torch_randn(d1a_hidden, d1b_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1b_hidden, d_out, requires_grad = TRUE)
# hidden layer bias
b1a <- torch_zeros(1, d1a_hidden, requires_grad = TRUE)
b1b <- torch_zeros(1, d1b_hidden, requires_grad = TRUE)
# output layer bias
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
#M<-5000
lvec <- rep(0,M*B)
tvec <- rep(0,M*B)
zz <- 0
for (t in 1:M) {
for (bb in (1:B)) {
zz <- zz+1
xtrb <- xtr[bins==bb,]
ytrb <- ytr[bins==bb]
### -------- Forward pass --------
y_trb <- xtrb$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
y_tr <- xtr$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
y_te <- xte$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_trb, ytrb)$mean()
#loss <- (y_tr - ytr)$pow(2)$mean()
lvec[zz] <- loss$item()
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[zz] <- tloss$item()
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors with
# requires_grad = TRUE
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1a <- w1a$sub_(learning_rate * w1a$grad)
w1b <- w1b$sub_(learning_rate * w1b$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1a <- b1a$sub_(learning_rate * b1a$grad)
b1b <- b1b$sub_(learning_rate * b1b$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero gradients after every pass, as they'd
# accumulate otherwise
w1a$grad$zero_()
w1b$grad$zero_()
w2$grad$zero_()
b1a$grad$zero_()
b1b$grad$zero_()
b2$grad$zero_()
})
}}
#y_new <- xnew$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1a=w1a,w1b=w1b,w2=w2,b1a=b1a,b1b=b1b,b2=b2)) }
mmb2<-myNN2CEbatch(x,y,tr,10,10,1,2000,50,1e-2) # batch 50, M=2000
batchs<-50
trs<-length(tr)
plot(mmb2$TR[seq(1, length(mmb2$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2,xlim=c(0,length(mm$TR)))
lines(mmb2$TE[seq(1,length(mmb2$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm2$TR,col=1)
lines(mm2$TE,lty=2,col=1)
# 1-layer network for comparison, in green (batch) and blue (vanilla)
batchs<-100
lines(mmb$TR[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=1,col=3)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=3)
lines(mm$TR,col=4)
lines(mm$TE,lty=2,col=4)
NNp <- NN2predictions(xnew,mmb2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
For the two-layer network, this kind of training can lead to overfitting. You need to explore the layer sizes, learning rates, and batch sizes, plus early stopping.
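A minimal early-stopping sketch under the simplest rule, stopping at the epoch with the lowest validation loss (val_epoch and best_epoch are new names; in practice you would save the weights as you go or refit with M set to that epoch):
val_epoch <- mmb2$TE[seq(1,length(mmb2$TE),by=floor(length(tr)/50))]   # one value per epoch, batch size 50
best_epoch <- which.min(val_epoch)
cat("lowest validation loss",round(min(val_epoch),3),"at epoch",best_epoch,"\n")
# e.g. refit for exactly that many epochs:
# mmb2_es <- myNN2CEbatch(x,y,tr,10,10,1,best_epoch,50,1e-2)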
mmc2<-myNN2CEbatch(x,y,tr,10,10,1,2000,100,1e-2) # batch 100, M=2000
batchs<-100
trs<-length(tr)
plot(mmc2$TR[seq(1, length(mmc2$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2,xlim=c(0,length(mm$TR)))
lines(mmc2$TE[seq(1,length(mmc2$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm2$TR,col=1)
lines(mm2$TE,lty=2,col=1)
# 1-layer network for comparison, in green (batch) and blue (vanilla)
batchs<-100
lines(mmb$TR[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=1,col=3)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=3)
lines(mm$TR,col=4)
lines(mm$TE,lty=2,col=4)
NNp <- NN2predictions(xnew,mmc2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
mmd2<-myNN2CEbatch(x,y,tr,10,10,1,500,25,1e-2) # batch 25, M=500
batchs<-25
trs<-length(tr)
plot(mmd2$TR[seq(1, length(mmd2$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2,xlim=c(0,length(mm$TR)))
lines(mmd2$TE[seq(1,length(mmd2$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm2$TR,col=1)
lines(mm2$TE,lty=2,col=1)
# 1-layer network for comparison, in green (batch) and blue (vanilla)
batchs<-100
lines(mmb$TR[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=1,col=3)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=3)
lines(mm$TR,col=4)
lines(mm$TE,lty=2,col=4)
NNp <- NN2predictions(xnew,mmd2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
Notice how the smaller batch size acts as a regularization mechanism.
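One quick way to quantify this, assuming the fitted objects are still in the workspace, is to compare the lowest validation loss each 2-layer fit reached (note that mmd2 was trained for fewer epochs):
round(c(full_batch=min(mm2$TE),batch_100=min(mmc2$TE),batch_50=min(mmb2$TE),batch_25=min(mmd2$TE)),3)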