NN from scratch
Below is some simple code that is easy to alter to get a better understanding of neural network training. You can play with the learning rate, the number of iterations, and the layer sizes, and easily extend the basic programs to more than 2 hidden layers.
#install.packages("torch")
#install.packages("text2vec")
#install.packages("sigmoid")
library(torch)
library(text2vec)
library(sigmoid)
# torch provides autograd, i.e. automatic derivatives of loss functions with respect to the parameters;
# sigmoid provides the plain-R relu()/sigmoid() used in the prediction functions below
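# A minimal autograd sketch (illustrative only, not part of the original programs):
# torch records operations on tensors created with requires_grad = TRUE, and
# backward() fills in $grad - exactly what the training loops below rely on.
xx <- torch_tensor(3, requires_grad = TRUE)
ff <- xx$pow(2)$sum()   # f(x) = x^2
ff$backward()           # autograd computes df/dx
xx$grad                 # 2*x = 6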
# 1 hidden layer NN
myNN1CE <- function(x,y,tr,d1_hidden,d_out,M,learning_rate) {
#
d_in <- dim(x)[2] # input dimension
#
# split into training and validation sets
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
#
# dimensionality of hidden layer
# d1_hidden
# output dimensionality (number of predicted features)
# d_out
#
# weights connecting input to hidden layer
w1 <- torch_randn(d_in, d1_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1_hidden, d_out, requires_grad = TRUE)
# hidden layer bias (intercept in the linear predictor)
b1 <- torch_zeros(1, d1_hidden, requires_grad = TRUE)
# output layer bias (intercept in the linear predictor)
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
# M = number of epochs
lvec <- rep(0,M)
tvec <- rep(0,M)
#
for (t in 1:M) {
### -------- Forward pass --------
y_tr <- xtr$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
y_te <- xte$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_tr, ytr)$mean()
### trace losses
lvec[t] <- loss$item() # training loss
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[t] <- tloss$item() # test loss
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1 <- w1$sub_(learning_rate * w1$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1 <- b1$sub_(learning_rate * b1$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero the gradients after every pass; otherwise they would
# accumulate across iterations and corrupt the next update
w1$grad$zero_()
w2$grad$zero_()
b1$grad$zero_()
b2$grad$zero_()
})
}
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1=w1,w2=w2,b1=b1,b2=b2)) }
### 2 hidden layer network
myNN2CE <- function(x,y,tr,d1a_hidden,d1b_hidden,d_out,M,learning_rate) {
# input layer dimension
d_in <- dim(x)[2]
#
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
#
# dimensionality of hidden layers
# d1a_hidden
# d1b_hidden
# output dimensionality (number of predicted features)
# d_out <- 1
# weights connecting input to hidden layer
w1a <- torch_randn(d_in, d1a_hidden, requires_grad = TRUE)
w1b <- torch_randn(d1a_hidden, d1b_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1b_hidden, d_out, requires_grad = TRUE)
# hidden layer bias (intercept)
b1a <- torch_zeros(1, d1a_hidden, requires_grad = TRUE)
b1b <- torch_zeros(1, d1b_hidden, requires_grad = TRUE)
# output layer bias (intercept)
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
lvec <- rep(0,M)
tvec <- rep(0,M)
for (t in 1:M) {
### -------- Forward pass --------
y_tr <- xtr$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid() # play with different activation functions here
y_te <- xte$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_tr, ytr)$mean()
#loss <- (y_tr - ytr)$pow(2)$mean() #MSE loss if you want to try this for regression
lvec[t] <- loss$item()
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[t] <- tloss$item()
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors with
# requires_grad = TRUE
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1a <- w1a$sub_(learning_rate * w1a$grad)
w1b <- w1b$sub_(learning_rate * w1b$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1a <- b1a$sub_(learning_rate * b1a$grad)
b1b <- b1b$sub_(learning_rate * b1b$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero gradients after every pass, as they'd
# accumulate otherwise
w1a$grad$zero_()
w1b$grad$zero_()
w2$grad$zero_()
b1a$grad$zero_()
b1b$grad$zero_()
b2$grad$zero_()
})
}
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1a=w1a,w1b=w1b,w2=w2,b1a=b1a,b1b=b1b,b2=b2)) }
## Simulated data
TwoCircleData <- function(N1,N2,r1,r2,noise) {
# Circle 1
theta1 <- runif(N1, 0, 2 * pi)
x1 <- r1 * cos(theta1) + rnorm(N1,sd=noise)
x2 <- r1 * sin(theta1) + rnorm(N1,sd=noise)
x <- cbind(x1,x2)
y <- rep(0,length(x1))
# Circle 2
theta2 <- runif(N2, 0, 2 * pi)
x1 <- r2 * cos(theta2) + rnorm(N2,sd=noise)
x2 <- r2 * sin(theta2) + rnorm(N2,sd=noise)
x <- rbind(x,cbind(x1,x2))
y <- c(y, rep(1,length(x1)))
return(list(x=x,y=y))
}
### Predicting on new data
NN1predictions <- function(xnew,NN) {
library(sigmoid)
w2 <- as.matrix(NN$w2)
b2 <- as.matrix(NN$b2)
w1 <- as.matrix(NN$w1)
b1 <- as.matrix(NN$b1)
#
z1 <- as.matrix(xnew)%*%w1
z1 <- relu(z1 + rep(b1,each=nrow(z1)))
z2 <- z1%*%w2
z2 <- sigmoid(z2 + rep(b2,each=nrow(z2)))
ynew <- z2
return(ynew)
}
NN2predictions <- function(xnew,NN) {
library(sigmoid)
w2 <- as.matrix(NN$w2)
b2 <- as.matrix(NN$b2)
w1a <- as.matrix(NN$w1a)
b1a <- as.matrix(NN$b1a)
w1b <- as.matrix(NN$w1b)
b1b <- as.matrix(NN$b1b)
#
z1 <- as.matrix(xnew)%*%w1a
z1 <- relu(z1 + rep(b1a,each=nrow(z1)))
z1 <- as.matrix(z1)%*%w1b
z1 <- relu(z1 + rep(b1b,each=nrow(z1)))
z2 <- z1%*%w2
z2 <- sigmoid(z2 + rep(b2,each=nrow(z2)))
ynew <- z2
return(ynew)
}
#library(torch)
MM <- TwoCircleData(2500,2500,2,3.25,.5)
plot(MM$x,col=MM$y+1)
x <- torch_tensor(MM$x)
y <- torch_tensor(MM$y)
xnew <- (expand.grid(x1 = seq(-5, 5, by = 0.1),
x2 = seq(-5, 5, by = 0.1)))
tr <- sample(seq(1,5000),500) # training set of size 500; also try 1000, 200, 100
mm<-myNN1CE(x,y,tr,10,1,2000,1e-2)
mm2<-myNN2CE(x,y,tr,10,10,1,2000,1e-2)
plot(mm$TR,type="l",ylim=c(0,1))
lines(mm$TE,lty=2)
lines(mm2$TR,col=2)
lines(mm2$TE,col=2,lty=2)
# dashed are validation, solid training
# red is 2 layer network
Here, the two-layer network was clearly better. Try smaller training samples as well: for small samples the two-layer network will start to perform worse than the single-layer one. Explore at home.
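A minimal sketch of that experiment (tr_small, mm_s, and mm2_s are new names introduced here; everything else reuses the functions and data defined above):
tr_small <- sample(seq(1,5000),100)   # a much smaller training set
mm_s  <- myNN1CE(x,y,tr_small,10,1,2000,1e-2)
mm2_s <- myNN2CE(x,y,tr_small,10,10,1,2000,1e-2)
plot(mm_s$TR,type="l",ylim=c(0,1))    # solid = training, dashed = validation
lines(mm_s$TE,lty=2)
lines(mm2_s$TR,col=2)                 # red = 2-layer network
lines(mm2_s$TE,col=2,lty=2)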
NNp <- NN1predictions(xnew,mm)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
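The dplyr/ggplot2 block above is repeated verbatim for every fitted model below. If you prefer, a small helper along these lines avoids the copy-and-paste; plotBoundary() is a new, hypothetical name and its body simply wraps the existing plotting code:
plotBoundary <- function(xnew,NNp,x,y,tr) {
library(dplyr)
library(ggplot2)
probN <- round(NNp)   # 0/1 class predictions on the grid
df1 <- mutate(xnew, prob = probN, class = 0, prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1, prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y=x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
}
# e.g. plotBoundary(xnew,NN1predictions(xnew,mm),x,y,tr)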
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
Explore how the decision boundaries of the single and two-layer networks vary as a function of training size.
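A minimal sketch of that exploration, assuming a crude comparison of the end-of-training validation loss is enough (n_vec, trn, fit1, and fit2 are new names introduced here):
n_vec <- c(100,200,500,1000)
for (n in n_vec) {
trn  <- sample(seq(1,5000),n)
fit1 <- myNN1CE(x,y,trn,10,1,2000,1e-2)
fit2 <- myNN2CE(x,y,trn,10,10,1,2000,1e-2)
cat("n =",n," 1-layer val loss:",round(tail(fit1$TE,1),3),
" 2-layer val loss:",round(tail(fit2$TE,1),3),"\n")
# you could also call plotBoundary(xnew,NN1predictions(xnew,fit1),x,y,trn) here
}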
Let's add stochasticity to the training, i.e. run the training on random batches of data.
myNN1CEbatch <- function(x,y,tr,d1_hidden,d_out,M,batch,learning_rate) {
# batch is another tuning parameter - the size of the batch to train on.
#
d_in <- dim(x)[2]
#
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
# batches for epochs
if (batch!=0) {
batchnumbers <- floor(length(tr)/batch)
B <- batchnumbers
bl <- floor(length(tr)/B)
bins <- rep(seq(1,B),bl)
if (length(bins)<length(tr)) {
bins<-c(bins,seq(1,length(tr)-length(bins)))}
bins<-sample(bins,length(bins)) }# scramble order of observations in batches
#
if (batch==0) { # full batch mode
B<-1
bins<-rep(1,length(tr)) }
#####
# dimensionality of hidden layer
#d1_hidden <- 10
#
# output dimensionality (number of predicted features)
#d_out <- 1
# weights connecting input to hidden layer
w1 <- torch_randn(d_in, d1_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1_hidden, d_out, requires_grad = TRUE)
# hidden layer bias (intercept)
b1 <- torch_zeros(1, d1_hidden, requires_grad = TRUE)
# output layer bias
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
#M<-5000
lvec <- rep(0,M*B) # one entry per batch update: M epochs * B batches per epoch
tvec <- rep(0,M*B)
zz<-0
for (t in 1:M) {
for (bb in (1:B)) {
zz <- zz+1
# get data for current batch
xtrb <- xtr[bins==bb,]
ytrb <- ytr[bins==bb]
### -------- Forward pass --------
y_trb <- xtrb$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid() # update predictions on batch
y_tr <- xtr$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid() # update the predictions on full training data
y_te <- xte$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_trb, ytrb)$mean() # only use the batch for loss computation and gradient updates
#loss <- (y_tr - ytr)$pow(2)$mean()
lvec[zz] <- loss$item()
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[zz] <- tloss$item()
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors with
# requires_grad = TRUE
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1 <- w1$sub_(learning_rate * w1$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1 <- b1$sub_(learning_rate * b1$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero gradients after every pass, as they'd
# accumulate otherwise
w1$grad$zero_()
w2$grad$zero_()
b1$grad$zero_()
b2$grad$zero_()
})
}}
#y_new <- xnew$mm(w1)$add(b1)$relu()$mm(w2)$add(b2)$sigmoid()
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1=w1,w2=w2,b1=b1,b2=b2)) }
mmb<-myNN1CEbatch(x,y,tr,10,1,2000,100,1e-2) # batch size 100 - explore other sizes
batchs<-100
trs<-length(tr)
plot(mmb$TR[seq(1, length(mmb$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm$TR,col=1)
lines(mm$TE,lty=2,col=1)
NNp <- NN1predictions(xnew,mmb)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN1predictions(xnew,mm)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
Notice how SGD, i.e. mini-batch training, resulted in a better fit and smoother decision boundaries. Try different batch sizes, learning rates, etc.
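A sketch of that comparison, overlaying the per-epoch validation loss of the 1-layer network for a few batch sizes (bsizes, fit, and step are new names; the per-epoch subsampling mirrors the seq(...,by=floor(trs/batchs)) trick used above):
bsizes <- c(25,50,100)
plot(mm$TE,type="l",ylim=c(0,1),col=1)   # full-batch 1-layer fit for reference (black)
for (i in seq_along(bsizes)) {
fit <- myNN1CEbatch(x,y,tr,10,1,2000,bsizes[i],1e-2)
step <- floor(length(tr)/bsizes[i])      # number of batches per epoch
lines(fit$TE[seq(1,length(fit$TE),by=step)],col=i+1)
}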
We can do the same for the two-layer network.
myNN2CEbatch <- function(x,y,tr,d1a_hidden,d1b_hidden,d_out,M,batch, learning_rate) {
d_in <- dim(x)[2]
xtr <- x[tr,]
ytr <- y[tr]
xte <- x[seq(1,length(y))[-tr],]
yte <- y[seq(1,length(y))[-tr]]
#
if (batch!=0) {
batchnumbers <- floor(length(tr)/batch)
B <- batchnumbers
bl <- floor(length(tr)/B)
bins <- rep(seq(1,B),bl)
if (length(bins)<length(tr)) {
bins<-c(bins,seq(1,length(tr)-length(bins)))}
bins<-sample(bins,length(bins)) }# scramble order of observations in batches
#
if (batch==0) { # full batch mode
B<-1
bins<-rep(1,length(tr)) }
#d1a_hidden <- 10
#d1b_hidden <- 40
# output dimensionality (number of predicted features)
#d_out <- 1
# weights connecting input to hidden layer
w1a <- torch_randn(d_in, d1a_hidden, requires_grad = TRUE)
w1b <- torch_randn(d1a_hidden, d1b_hidden, requires_grad = TRUE)
# weights connecting hidden to output layer
w2 <- torch_randn(d1b_hidden, d_out, requires_grad = TRUE)
# hidden layer bias
b1a <- torch_zeros(1, d1a_hidden, requires_grad = TRUE)
b1b <- torch_zeros(1, d1b_hidden, requires_grad = TRUE)
# output layer bias
b2 <- torch_zeros(1, d_out, requires_grad = TRUE)
#######
### training loop ----------------------------------------
#M<-5000
lvec <- rep(0,M*B)
tvec <- rep(0,M*B)
zz <- 0
for (t in 1:M) {
for (bb in (1:B)) {
zz <- zz+1
xtrb <- xtr[bins==bb,]
ytrb <- ytr[bins==bb]
### -------- Forward pass --------
y_trb <- xtrb$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
y_tr <- xtr$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
y_te <- xte$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
### -------- Compute loss --------
loss <- nnf_binary_cross_entropy(y_trb, ytrb)$mean()
#loss <- (y_tr - ytr)$pow(2)$mean()
lvec[zz] <- loss$item()
tloss <- nnf_binary_cross_entropy(y_te, yte)$mean()
tvec[zz] <- tloss$item()
### -------- Backpropagation --------
# compute gradient of loss w.r.t. all tensors with
# requires_grad = TRUE
loss$backward()
### -------- Update weights --------
# Wrap in with_no_grad() because this is a part we don't
# want to record for automatic gradient computation
with_no_grad({
w1a <- w1a$sub_(learning_rate * w1a$grad)
w1b <- w1b$sub_(learning_rate * w1b$grad)
w2 <- w2$sub_(learning_rate * w2$grad)
b1a <- b1a$sub_(learning_rate * b1a$grad)
b1b <- b1b$sub_(learning_rate * b1b$grad)
b2 <- b2$sub_(learning_rate * b2$grad)
# Zero gradients after every pass, as they'd
# accumulate otherwise
w1a$grad$zero_()
w1b$grad$zero_()
w2$grad$zero_()
b1a$grad$zero_()
b1b$grad$zero_()
b2$grad$zero_()
})
}}
#y_new <- xnew$mm(w1a)$add(b1a)$relu()$mm(w1b)$add(b1b)$relu()$mm(w2)$add(b2)$sigmoid()
return(list(Train=y_tr,Pred=y_te,TR=lvec,TE=tvec,w1a=w1a,w1b=w1b,w2=w2,b1a=b1a,b1b=b1b,b2=b2)) }
mmb2<-myNN2CEbatch(x,y,tr,10,10,1,2000,50,1e-2) # batch 50, M=2000
batchs<-50
trs<-length(tr)
plot(mmb2$TR[seq(1, length(mmb2$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2,xlim=c(0,length(mm$TR)))
lines(mmb2$TE[seq(1,length(mmb2$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm2$TR,col=1)
lines(mm2$TE,lty=2,col=1)
# 1-layer network for comparison, in green (batch) and blue (vanilla)
batchs<-100
lines(mmb$TR[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=1,col=3)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=3)
lines(mm$TR,col=4)
lines(mm$TE,lty=2,col=4)
NNp <- NN2predictions(xnew,mmb2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
For the two-layer network, this kind of training can lead to overfitting. You need to explore the layer sizes, learning rates, and batch sizes, plus early stopping.
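A minimal early-stopping sketch under the simplest rule, stopping at the epoch with the lowest validation loss (val_epoch and best_epoch are new names; in practice you would save the weights as you go or refit with M set to that epoch):
val_epoch <- mmb2$TE[seq(1,length(mmb2$TE),by=floor(length(tr)/50))]   # one value per epoch, batch size 50
best_epoch <- which.min(val_epoch)
cat("lowest validation loss",round(min(val_epoch),3),"at epoch",best_epoch,"\n")
# e.g. refit for exactly that many epochs:
# mmb2_es <- myNN2CEbatch(x,y,tr,10,10,1,best_epoch,50,1e-2)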
mmc2<-myNN2CEbatch(x,y,tr,10,10,1,2000,100,1e-2) # batch 100, M=2000
batchs<-100
trs<-length(tr)
plot(mmc2$TR[seq(1, length(mmc2$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2,xlim=c(0,length(mm$TR)))
lines(mmc2$TE[seq(1,length(mmc2$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm2$TR,col=1)
lines(mm2$TE,lty=2,col=1)
# 1-layer network for comparison, in green (batch) and blue (vanilla)
batchs<-100
lines(mmb$TR[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=1,col=3)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=3)
lines(mm$TR,col=4)
lines(mm$TE,lty=2,col=4)
NNp <- NN2predictions(xnew,mmc2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
mmd2<-myNN2CEbatch(x,y,tr,10,10,1,500,25,1e-2) # batch 25, M=500
batchs<-25
trs<-length(tr)
plot(mmd2$TR[seq(1, length(mmd2$TR),by=floor(trs/batchs))],type="l",ylim=c(0,1),col=2,xlim=c(0,length(mm$TR)))
lines(mmd2$TE[seq(1,length(mmd2$TR),by=floor(trs/batchs))],lty=2,col=2)
lines(mm2$TR,col=1)
lines(mm2$TE,lty=2,col=1)
# 1-layer network for comparison, in green (batch) and blue (vanilla)
batchs<-100
lines(mmb$TR[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=1,col=3)
lines(mmb$TE[seq(1,length(mmb$TR),by=floor(trs/batchs))],lty=2,col=3)
lines(mm$TR,col=4)
lines(mm$TE,lty=2,col=4)
NNp <- NN2predictions(xnew,mmd2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
NNp <- NN2predictions(xnew,mm2)
probN <- round(NNp)
library(dplyr)
library(ggplot2)
df1 <- mutate(xnew, prob = probN, class = 0,
prob_cls = ifelse(probN == class, 1, 0))
df2 <- mutate(xnew, prob = probN, class = 1,
prob_cls = ifelse(probN == class, 1, 0))
bigdf <- bind_rows(df1, df2)
trdf <- as.data.frame(cbind(as.matrix(x[tr,]),as.integer(y[tr])))
names(trdf) <- c("x1","x2","class")
ggplot(bigdf) +
geom_point(aes(x=x1, y =x2, col=class),
data = mutate(xnew, class = as.factor(probN)),
size = .25) +
geom_point(aes(x = x1, y = x2, col = as.factor(class)),
size = 4, shape = 1, data = trdf) +
geom_contour(aes(x = x1, y = x2, z = prob_cls,
group = as.factor(class), color = as.factor(class)),
size = 1, bins = 1, data = bigdf) + theme_bw()
Notice how the smaller batch size acts as a regularization mechanism.
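One quick way to quantify this, assuming the fitted objects are still in the workspace, is to compare the lowest validation loss each 2-layer fit reached (note that mmd2 was trained for fewer epochs):
round(c(full_batch=min(mm2$TE),batch_100=min(mmc2$TE),batch_50=min(mmb2$TE),batch_25=min(mmd2$TE)),3)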