I want to prepare data for unsupervised learning with random forest.
The procedure is as follows:
- Take data and add attribute ‘class’ with value 1 to all examples
- Generate synthetic data from original data:
- while you don’t have the same number of examples as in original data build examples:
- sample new attribute value from all values of that attribute in original data
- do that for all attributes and combine them into new example
- while you don’t have the same number of examples as in original data build examples:
- assign to attribute ‘class’ of synthetic data value 2
- bind both data together
At the end it look like this:
... Class
|1
Original |1
Data |1
|1
--------------
|2
Synthetic |2
Data |2
|2
My R code looks like this:
library(gtools) #for smartbind()
sample1 <- function(X) { sample(X, replace=T) }
g1 <- function(dat) { apply(dat,2,sample1) }
data$class <- rep(1, times=nrow(data)) #add attribute 'class' with value 1
synthData<-data.frame(g1(data[,1:ncol(data)])) #generate synthetic data with sampling from data
synthData$class <- rep(2, times=nrow(synthData)) #attribute 'class' is 2
colnames(synthData) <- colnames(data)
newData <- smartbind(data, synthData) #bind the data together
It’s probably obvious that I’m really new to R, but it works – there is just one problem: types of attributes in synthetic data are not the same as in original data. If in original they are nums, now they become factors. How could I preserve same type while generating synthetic data?
Thank you!
Data1 (nums become factors):
structure(list(V2 = c(1.51793, 1.51711, 1.51645, 1.51916, 1.51131
), V3 = c(13.21, 12.89, 13.44, 14.15, 13.69), V4 = c(3.48, 3.62,
3.61, 0, 3.2), V5 = c(1.41, 1.57, 1.54, 2.09, 1.81), V6 = c(72.64,
72.96, 72.39, 72.74, 72.81), V7 = c(0.59, 0.61, 0.66, 0, 1.76
), V8 = c(8.43, 8.11, 8.03, 10.88, 5.43), V9 = c(0, 0, 0, 0,
1.19), V10 = c(0, 0, 0, 0, 0), realClass = structure(c(1L, 2L,
2L, 5L, 6L), .Label = c(“1”, “2”, “3”, “5”, “6”, “7”), class = “factor”)), .Names = c(“V2”,
“V3”, “V4”, “V5”, “V6”, “V7”, “V8”, “V9”, “V10”, “realClass”), row.names = c(27L,
138L, 77L, 183L, 186L), class = “data.frame”)
Data2 (factors become chrs):
structure(list(realClass = structure(c(2L, 2L, 2L, 1L, 2L), .Label = c(“e”,
“p”), class = “factor”), V2 = structure(c(6L, 3L, 4L, 6L, 6L), .Label = c(“b”,
“c”, “f”, “k”, “s”, “x”), class = “factor”), V3 = structure(c(4L,
4L, 3L, 1L, 1L), .Label = c(“f”, “g”, “s”, “y”), class = “factor”),
V4 = structure(c(5L, 5L, 5L, 3L, 4L), .Label = c(“b”, “c”,
“e”, “g”, “n”, “p”, “r”, “u”, “w”, “y”), class = “factor”),
V5 = structure(c(1L, 1L, 1L, 2L, 1L), .Label = c(“f”, “t”
), class = “factor”), V6 = structure(c(3L, 9L, 3L, 6L, 3L
), .Label = c(“a”, “c”, “f”, “l”, “m”, “n”, “p”, “s”, “y”
), class = “factor”), V7 = structure(c(2L, 2L, 2L, 2L, 2L
), .Label = c(“a”, “f”), class = “factor”), V8 = structure(c(1L,
1L, 1L, 1L, 1L), .Label = c(“c”, “w”), class = “factor”),
V9 = structure(c(2L, 2L, 2L, 1L, 1L), .Label = c(“b”, “n”
), class = “factor”), V10 = structure(c(1L, 1L, 1L, 10L,
4L), .Label = c(“b”, “e”, “g”, “h”, “k”, “n”, “o”, “p”, “r”,
“u”, “w”, “y”), class = “factor”), V11 = structure(c(2L,
2L, 2L, 2L, 1L), .Label = c(“e”, “t”), class = “factor”),
V12 = structure(c(NA, NA, NA, 1L, 1L), .Label = c(“b”, “c”,
“e”, “r”), class = “factor”), V13 = structure(c(3L, 2L, 3L,
3L, 2L), .Label = c(“f”, “k”, “s”, “y”), class = “factor”),
V14 = structure(c(3L, 3L, 2L, 3L, 2L), .Label = c(“f”, “k”,
“s”, “y”), class = “factor”), V15 = structure(c(7L, 8L, 7L,
4L, 7L), .Label = c(“b”, “c”, “e”, “g”, “n”, “o”, “p”, “w”,
“y”), class = “factor”), V16 = structure(c(7L, 7L, 8L, 4L,
1L), .Label = c(“b”, “c”, “e”, “g”, “n”, “o”, “p”, “w”, “y”
), class = “factor”), V17 = structure(c(1L, 1L, 1L, 1L, 1L
), .Label = “p”, class = “factor”), V18 = structure(c(3L,
3L, 3L, 3L, 3L), .Label = c(“n”, “o”, “w”, “y”), class = “factor”),
V19 = structure(c(2L, 2L, 2L, 2L, 2L), .Label = c(“n”, “o”,
“t”), class = “factor”), V20 = structure(c(1L, 1L, 1L, 5L,
3L), .Label = c(“e”, “f”, “l”, “n”, “p”), class = “factor”),
V21 = structure(c(8L, 8L, 8L, 4L, 2L), .Label = c(“b”, “h”,
“k”, “n”, “o”, “r”, “u”, “w”, “y”), class = “factor”), V22 = structure(c(5L,
5L, 5L, 5L, 6L), .Label = c(“a”, “c”, “n”, “s”, “v”, “y”), class = “factor”),
V23 = structure(c(3L, 3L, 5L, 1L, 2L), .Label = c(“d”, “g”,
“l”, “m”, “p”, “u”, “w”), class = “factor”)), .Names = c(“realClass”,
“V2”, “V3”, “V4”, “V5”, “V6”, “V7”, “V8”, “V9”, “V10”, “V11”,
“V12”, “V13”, “V14”, “V15”, “V16”, “V17”, “V18”, “V19”, “V20”,
“V21”, “V22”, “V23”), row.names = c(4105L, 6207L, 6696L, 2736L,
3756L), class = “data.frame”)
You can always use this trick to have numeric columns
But I suspect that you have factor variable in your data.frame.
Since
applyreturn a matrix, if you have one factor in your data, all the numeric variable will be coerced to factor too.Here is an example, using toy dataset
This is where the
plyrpackage became useful, since you can control the output (using **ply). But in this case, thecolwisefunction is sufficient