Hilfe beim verstehen eines Skripts
Verfasst: Mi Nov 14, 2018 10:20 am
Hallo Zusammen, ich sitze seit einigen Tagen an einem geschrieben Skript und würde das gerne deuten. Dieses Skript läuft im Hintergrund eines Tools und soll bei Eingabe einer Datei und gesetzten Filtern, doppelte Daten ausfindig machen. Ich würde mich über Hilfe, bzw. auch gerne PN freuen.
Viele Grüße
Viele Grüße
Code: Alles auswählen
if("cluster" %in% rownames(installed.packages()) == FALSE) { install.packages("cluster") }
if("stringdist" %in% rownames(installed.packages()) == FALSE) { install.packages("stringdist") }
library(cluster)
library(stringdist)
#library(parallel)
#library(plyr)
counter <- 0
trim <- function( x ) {
gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
}
assignClusterName <- function (clusterIDsVector, segName) {
clusterIDsVector <- sapply(clusterIDsVector, function (x) return(paste0(segName, ".", x)))
return (as.data.frame(clusterIDsVector))
}
clusterStrDist <- function (strDist, df.records, segmentName, outputFile, columns.output, cluster_height) {
if (counter > 0)
cols <- TRUE
else
cols <- FALSE
clusterDendro <- hclust(strDist, method="complete")
clusterIDs <- cutree(clusterDendro, h=cluster_height)
clusterIDs <- assignClusterName(as.vector(clusterIDs), segmentName)
result <- cbind(df.records, clusterIDs)
#write.table(result, file = "clusters_dep_full.csv", sep=";", row.names=FALSE, col.names=FALSE, append=TRUE)
occs <- table(result$clusterIDs)
result$occs <- occs[result$clusterIDs]
result.dubs <- result[result$occs > 1, ]
colout <- c(columns.output, c("clusterIDsVector", "occs"))
write.table(result.dubs[colout], file = outputFile, sep=";", row.names=FALSE, col.names=cols, append=TRUE)
counter <- counter+1
rm(result)
rm(result.dubs)
rm(clusterDendro)
rm(clusterIDs)
}
strdistmatrixSelf <- function (strcmpVector, method, useBytes, weight) {
strdistMatrix <- stringdistmatrix(strcmpVector, strcmpVector, method, useBytes)
strdistMatrix <- strdistMatrix * as.numeric(weight)
return(strdistMatrix)
}
multiColStrcmp <- function (segmentName, df.records, outputFile, cols, colWeights, columns.output , method, useBytes, cluster_height) {
strdistMatrixList <- lapply(cols, function(x) return(strdistmatrixSelf(as.vector(unlist(df.records[[x]])), method=method, useBytes=useBytes, weight=colWeights[match(x, cols)])))
strDistMatrix <- as.matrix(Reduce("+", strdistMatrixList)) / as.numeric((rep(1,length(colWeights)) %*% colWeights))
rownames(strDistMatrix) <- df.records[[1]]
strDist <- as.dist(strDistMatrix)
rm(strdistMatrixList)
rm(strDistMatrix)
clusterStrDist(strDist, df.records, segmentName, outputFile=outputFile, columns.output=columns.output, cluster_height = cluster_height)
rm(strDist)
#gc()
}
startDedub <- function (inputFile, outputFile, columns.match, columns.strcmp, columns.output, column.filter = " ", column.filterValue = " ", columns.strcmp.weights = c(), strcmpfun="jw", cluster_method="complete", cluster_height=0.1) {
# run script for specified file
# U+00A7 means " as quote character
df.input <- read.csv(file=inputFile, head=TRUE, sep=";")
# some type checking
if (typeof(strcmpfun) != "character")
stop(sprintf(paste("Illegal strcmpfun type: %s", typeof(strcmpfun))))
if (!is.data.frame(df.input))
stop(sprintf(paste("Illegal data frame: %s", typeof(df.input))))
if (!length(columns.strcmp > 0))
stop(sprintf("Provide at least one column for fuzzy string matching"))
if (length(columns.strcmp.weights) == 0)
columns.strcmp.weights <- rep(1, length(columns.strcmp))
if (length(columns.strcmp.weights) != length(columns.strcmp))
stop(sprintf("Please provide exactly one weight for each column to for string comparison"))
lapply(columns.strcmp, function (x) df.input[x] <- trim(tolower(df.input[[x]])))
lapply(columns.strcmp, function (x) df.input[x] <- gsub("*", "", df.input[[x]]))
if (!(column.filter == " ")) {
df.input <- df.input[df.input[column.filter]==column.filterValue, ]
}
gc()
if (length(columns.match) > 0) {
#matchingFramesList <- split(df.input, df.input[,columns.match])
df.input <- within(df.input, splitKey <- do.call("paste", c(df.input[columns.match], sep = ".")))
names(df.input)[length(names(df.input))] <- "splitKey"
#setkey(df.input, "splitKey")
idx <- split(seq_len(nrow(df.input)), df.input$splitKey)
# Check for segments without entities
filterList <- lapply(idx, function (x) length(x) > 1)
idx <- idx[unlist(filterList)]
print(length(idx))
rm(filterList)
gc()
matchingFramesList <- lapply(idx, function(x) { return(as.data.frame(df.input[x,])) } )
# Calculate the number of cores
#no_cores <- detectCores() - 1
# Initiate cluster
#cl <- makeCluster(no_cores)
#combine matching columns in unique key
#clusterExport(cl, varlist=c("idx", "df.input"), envir=environment())
#matchingFramesList <- parLapply(cl, idx, function(x) { return(as.data.frame(df.input[x,])) } )
#stopCluster(cl)
#segmentNames <- unique(df.input[columns.match])
}
else {
matchingFramesList <- list(ALL=df.input)
}
segmentNames <- names(matchingFramesList)
lapply(segmentNames, function (x) multiColStrcmp(x, matchingFramesList[[x]], outputFile = outputFile, cols = columns.strcmp, colWeights = columns.strcmp.weights, columns.output = columns.output, method = strcmpfun, useBytes = TRUE, cluster_height = cluster_height))
gc()
return(TRUE)