The code below generates the desired output. However, the lack of vectorizing means it runs very slowly. How can I speed it up?
I’ve put the dput results from part of some indicative data.
Input dputs
-
StandRefinputstructure(list(id = structure(c(43L, 50L, 17L, 45L, 9L, 5L, 49L, 33L, 48L, 39L, 71L, 64L, 44L, 47L, 58L, 24L, 15L, 37L, 14L, 11L, 26L, 57L, 4L, 30L, 72L, 21L, 23L, 60L, 38L, 59L, 29L, 19L, 6L, 46L, 36L, 3L, 63L, 55L, 51L, 35L, 10L, 7L, 16L, 73L, 42L, 52L, 41L, 27L, 25L, 61L, 20L, 70L, 53L, 18L, 31L, 22L, 1L, 8L, 2L, 40L, 65L, 67L, 28L, 56L, 13L, 32L, 54L, 66L, 68L, 34L, 12L, 69L, 62L), .Label = c("ID 1009445", "ID 120763", "ID 133883", "ID 136398", "ID 171850", "ID 192595", "ID 197597", "ID 216406", "ID 21888", "ID 230940", "ID 23777", "ID 282791", "ID 306348", "ID 309745", "ID 326928", "ID 344897", "ID 34974", "ID 350157", "ID 391831", "ID 402479", "ID 43010", "ID 484078", "ID 484697", "ID 537134", "ID 562259", "ID 562455", "ID 567042", "ID 572866", "ID 578945", "ID 595683", "ID 59759", "ID 598460", "ID 603611", "ID 603757", "ID 607991", "ID 60976", "ID 622720", "ID 646989", "ID 656144", "ID 668807", "ID 669435", "ID 720522", "ID 740555", "ID 745499", "ID 746001", "ID 783969", "ID 78979", "ID 792426", "ID 793541", "ID 797860", "ID 806559", "ID 810517", "ID 826054", "ID 837609", "ID 839287", "ID 867918", "ID 869788", "ID 875380", "ID 876870", "ID 882220", "ID 893116", "ID 895909", "ID 899050", "ID 900143", "ID 908100", "ID 912185", "ID 916371", "ID 916620", "ID 957879", "ID 966195", "ID 993247", "ID 998911", "ID 999610"), class = "factor"), region = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), location = c(259090L, 559306L, 2227063L, 2369217L, 4026978L, 4211264L, 4679449L, 5105226L, 5106345L, 5344670L, 5473601L, 5476528L, 5871970L, 6461228L, 6700029L, 6708265L, 7639959L, 9297695L, 10254788L, 10328812L, 11102816L, 11568295L, 11720437L, 12843457L, 14012506L, 14156669L, 14632300L, 14641938L, 15298211L, 15468425L, 15534406L, 16279682L, 16699353L, 17226952L, 17320785L, 269017L, 453097L, 828833L, 954610L, 954842L, 1066378L, 1217332L, 1253530L, 1277716L, 1292857L, 1337952L, 1439657L, 1452989L, 1712345L, 1758035L, 2601630L, 2640359L, 2778095L, 3151129L, 3369931L, 3399080L, 3529525L, 3810217L, 3821120L, 3841588L, 3901557L, 4111633L, 4220440L, 4528632L, 4665450L, 5099307L, 5260242L, 5958770L, 5966356L, 6137405L, 6246065L, 6297231L, 6807949L)), .Names = c("id", "region", "location"), class = "data.frame", row.names = c(NA, -73L)) -
Two sample inputs
Sample 1
structure(list(region = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
begin = c(0L, 2259252L, 5092077L, 9158205L, 0L, 135094L,
941813L, 5901391L, 6061324L), finish = c(2259252L, 5092077L,
9158205L, 20463033L, 135094L, 941813L, 5901391L, 6061324L,
7092402L), sed = c(3.98106154985726, 7.51649828394875, 5.15440228627995,
2.67456624889746, 7.54309412557632, 4.17413910385221, 7.47043058509007,
6.13362524658442, 1.00084994221106)), .Names = c("region",
"begin", "finish", "sed"), class = "data.frame", row.names = c(NA,
-9L))
Sample 2
structure(list(region = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
begin = c(0L, 2253252L, 7091077L, 9120205L, 0L, 135094L,
941813L, 5901391L, 6061324L), finish = c(2253252L, 7091077L,
9120205L, 17463033L, 135094L, 941813L, 5901391L, 6061324L,
7092402L), sed = c(3.31830840984048, 1.38014704208403, 6.13049140975458,
2.10349875097134, 0.48170587509345, 0.13058713509175, 9.13509713513509,
6.13047153058701, 3.81734081501503)), .Names = c("region",
"begin", "finish", "sed"), class = "data.frame", row.names = c(NA,
-9L))
Unvectorized Code
matchLocationsToRegions <- function(path) {
# get list of data files (around 500 of these; only dput of 2 given: sample262519 and sample252519)
setwd(path,sep="",collapse=NULL)
data_files <- list.files()
# read in template file with complete regional boundaries
standRef <- read.table(paste(path, "StandRef.txt",sep="",collapse=NULL), header=TRUE, sep="\t")
# pre-allocate a df with row dimensions of standRef and num of columns according to num of data files
sediment.df <- as.data.frame(matrix(NA,nrow=nrow(standRef),ncol=length(data_files)))
colnames(sediment.df) <- data_files
rownames(sediment.df) <- standRef[,1]
# create a counter for columns filled
col_counter <- 1
for (file in data_files) {
# read in current, processed data
sample <- read.table(file, header=TRUE, sep="\t")
# pre-allocate vectors for sedimentation data vector
sed <- rep(NA, nrow(standRef))
# create a variable to track end boundary for a particular sample_ID
end_tracker <- 1
index <- unlist(lapply (unique(standRef$region), function(reg) {
reg.filter <- which(standRef$region == reg)
samp.filter <- which(sample$region == reg)
samp.filter[cut(standRef$location[reg.filter],c(0L,sample$finish[samp.filter]),labels=F)]
}))
sed <- sample$sed[index]
# fill in next, unfilled column of relevant df with data from relevant vector
sediment.df[col_counter] <- sed
# update column counter variable
col_counter <- col_counter + 1
}
# save df as a table
write.table(sediment.df,file="samples_sed.txt", row.names=TRUE, sep="\t")
}
Running Rprof showed that
"scan" "read.table" "matchLocationsToRegions" and
"type.convert" "read.table" "matchLocationsToRegions"
predominate runtime. Presumably, there’s a bottleneck by for-looping over this line:
sample <- read.table(file, header=TRUE, sep="\t")
Update: for loop over the regions has been replaced with much faster-executing code (h/t Simon Urbanek). However, the rest is quite slow.
You can remove the loop easily:
However, it is not unlikely that a lot of time is spent in
read.tableso you may consider a) usingscan, b) creating just one file with all samples (e.g. use an extra column to define the sample) so you don’t need to load many files.