giorgioluciano.github.io - Snippet #3: Functions for simulating data

Example of creating variables using runif and rnorm
Writing a function that wraps these calls into a reusable dataset generator

First of all, we call runif and rnorm directly to see how they work.

library(data.table)
# Range and resolution of the x grid.
x_min  <- 0
x_max  <- 10
x_step <- 0.01

# Parameters of the simulated values: mean/sd for the normal draw,
# lower/upper bounds for the uniform draw.
y_mean <- 0.5
y_sd   <- 0.25
y_min  <- -1
y_max  <- 1

# One uniform and one normal draw per grid point.
x          <- seq(from = x_min, to = x_max, by = x_step)
var_random <- runif(length(x), min = y_min, max = y_max)
var_norm   <- rnorm(length(x), mean = y_mean, sd = y_sd)

# Keep the result both as a plain data.frame and as a data.table.
df <- data.frame(x = x, var_random = var_random, var_norm = var_norm)
dt <- data.table(df)

#' Simulate a table of independent, normally distributed columns
#'
#' Draws `number_of_rows` values from `rnorm()` for every element of `means`,
#' using the matching element of `sds`, and returns the draws side by side.
#'
#' @param number_of_rows Number of values to draw for each column.
#' @param means Numeric vector of column means; its length sets the number of
#'   value columns.
#' @param sds Numeric vector of standard deviations, one per element of
#'   `means`.
#' @return A data.table with a character `id` column ("1", "2", ...) followed
#'   by one numeric column per element of `means`, named `X1`, `X2`, ...
simpleDataset <- function(number_of_rows, means, sds) {
  n_cols <- length(means)
  if (length(sds) < n_cols) {
    stop(
      "`sds` must provide one standard deviation for each element of `means`",
      call. = FALSE
    )
  }

  # Call rnorm() directly instead of eval(parse(text = ...)): building the
  # call as text rounds the parameters to their printed representation.
  # seq_len() gives an empty sequence when `means` is empty.
  columns <- lapply(seq_len(n_cols), function(j) {
    rnorm(number_of_rows, mean = means[j], sd = sds[j])
  })
  names(columns) <- paste0("X", seq_len(n_cols))

  # Assemble from the list of columns so that a single row or a single column
  # keeps the same layout as the general case.
  id <- as.character(seq_len(number_of_rows))
  as.data.table(c(list(id = id), columns))
}

Example 1: We simulate the LDL cholesterol values of 2 patients at 3 different times. The first patient (X1) has an average LDL value of 200 with a standard deviation of 2, while the second (X2) has an average of 180 with a standard deviation of 10. Note: All values are expressed in mg/dL.

# Three time points for two patients: means 200 and 180, sds 2 and 10.
dataset1 <- simpleDataset(
  number_of_rows = 3,
  means = c(200, 180),
  sds = c(2, 10)
)
dataset1

   id       X1       X2
1:  1 200.1841 167.1265
2:  2 199.9952 200.8787
3:  3 201.2486 187.5768

Example 2: this time we combine runif and simpleDataset. We simulate the LDL cholesterol values of 5 patients at 7 different times. Each patient's mean is drawn between a min = 100 and a max = 150, with a standard deviation drawn between a min sd = 10 and a max sd = 40. We also simulate two times that contain outlier values, with means drawn between a min = 180 and a max = 200 and standard deviations between a min sd = 10 and a max sd = 40. We merge the values for each patient (7 times + 2 outlier times) and finally we use the function melt to reshape the dataset.

# Seven regular time points for five patients; each patient's mean and sd
# are drawn uniformly from [100, 150] and [10, 40].
dat1 <- simpleDataset(
  number_of_rows = 7,
  means = runif(5, min = 100, max = 150),
  sds = runif(5, min = 10, max = 40)
)

# Two additional time points whose means are drawn from [180, 200].
outliers <- simpleDataset(
  number_of_rows = 2,
  means = runif(5, min = 180, max = 200),
  sds = runif(5, min = 10, max = 40)
)

dat1

   id        X1       X2        X3        X4        X5
1:  1 153.59476 150.6474 119.10800 169.21507 111.45022
2:  2 136.25580 197.2160 109.72680 103.96251 103.52508
3:  3  88.40632 127.6134  94.28173  96.85567 152.28825
4:  4 114.22804 144.0820 169.41607 132.85758 124.59658
5:  5 147.04467 140.8414  75.96130  92.81705 107.48489
6:  6 130.92207 137.8937 130.39466  83.80787 119.87301
7:  7 125.79370 114.3462  56.15695  97.22265  62.44163

# Display the two simulated outlier time points.
outliers

   id       X1       X2       X3       X4       X5
1:  1 192.4697 146.3098 155.0556 179.5488 151.9553
2:  2 195.1163 163.9017 183.5246 189.1330 163.9440

# Stack the regular time points and the outlier time points into one table.
dato <- rbind(dat1, outliers)
# Both inputs number their rows from "1", so renumber the combined rows to
# keep one distinct id per time point.
dato$id <- as.character(seq_len(nrow(dato)))

# Reshape the combined table to long format. Melting `dat1` alone would drop
# the outlier rows. NOTE: the printed transcript that follows was captured
# from the dat1-only version, so it lists 35 rows rather than the 45 rows
# this code produces.
dt.melt <- melt(
  dato,
  id.vars = "id",
  variable.name = "category",
  value.name = "var1"
)
# Numeric code of the patient column (factor level index).
dt.melt$ncat <- as.numeric(dt.melt$category)

dt.melt

    id category      var1 ncat
 1:  1       X1 153.59476    1
 2:  2       X1 136.25580    1
 3:  3       X1  88.40632    1
 4:  4       X1 114.22804    1
 5:  5       X1 147.04467    1
 6:  6       X1 130.92207    1
 7:  7       X1 125.79370    1
 8:  1       X2 150.64741    2
 9:  2       X2 197.21595    2
10:  3       X2 127.61337    2
11:  4       X2 144.08198    2
12:  5       X2 140.84145    2
13:  6       X2 137.89369    2
14:  7       X2 114.34618    2
15:  1       X3 119.10800    3
16:  2       X3 109.72680    3
17:  3       X3  94.28173    3
18:  4       X3 169.41607    3
19:  5       X3  75.96130    3
20:  6       X3 130.39466    3
21:  7       X3  56.15695    3
22:  1       X4 169.21507    4
23:  2       X4 103.96251    4
24:  3       X4  96.85567    4
25:  4       X4 132.85758    4
26:  5       X4  92.81705    4
27:  6       X4  83.80787    4
28:  7       X4  97.22265    4
29:  1       X5 111.45022    5
30:  2       X5 103.52508    5
31:  3       X5 152.28825    5
32:  4       X5 124.59658    5
33:  5       X5 107.48489    5
34:  6       X5 119.87301    5
35:  7       X5  62.44163    5
    id category      var1 ncat

# Show the column types of the melted table.
str(dt.melt)

Classes 'data.table' and 'data.frame':  35 obs. of  4 variables:
 $ id      : chr  "1" "2" "3" "4" ...
 $ category: Factor w/ 5 levels "X1","X2","X3",..: 1 1 1 1 1 1 1 2 2 2 ...
 $ var1    : num  153.6 136.3 88.4 114.2 147 ...
 $ ncat    : num  1 1 1 1 1 1 1 2 2 2 ...
 - attr(*, ".internal.selfref")=<externalptr>