Viele Grüße
Code: Alles auswählen
if("cluster" %in% rownames(installed.packages()) == FALSE) { install.packages("cluster") }
if("stringdist" %in% rownames(installed.packages()) == FALSE) { install.packages("stringdist") }
library(cluster)
library(stringdist)
#library(parallel)
#library(plyr)
counter <- 0
trim <- function( x ) {
gsub("(^[[:space:]]+|[[:space:]]+$)", "", x)
}
assignClusterName <- function (clusterIDsVector, segName) {
clusterIDsVector <- sapply(clusterIDsVector, function (x) return(paste0(segName, ".", x)))
return (as.data.frame(clusterIDsVector))
}
clusterStrDist <- function (strDist, df.records, segmentName, outputFile, columns.output, cluster_height) {
if (counter > 0)
cols <- TRUE
else
cols <- FALSE
clusterDendro <- hclust(strDist, method="complete")
clusterIDs <- cutree(clusterDendro, h=cluster_height)
clusterIDs <- assignClusterName(as.vector(clusterIDs), segmentName)
result <- cbind(df.records, clusterIDs)
#write.table(result, file = "clusters_dep_full.csv", sep=";", row.names=FALSE, col.names=FALSE, append=TRUE)
occs <- table(result$clusterIDs)
result$occs <- occs[result$clusterIDs]
result.dubs <- result[result$occs > 1, ]
colout <- c(columns.output, c("clusterIDsVector", "occs"))
write.table(result.dubs[colout], file = outputFile, sep=";", row.names=FALSE, col.names=cols, append=TRUE)
counter <- counter+1
rm(result)
rm(result.dubs)
rm(clusterDendro)
rm(clusterIDs)
}
strdistmatrixSelf <- function (strcmpVector, method, useBytes, weight) {
strdistMatrix <- stringdistmatrix(strcmpVector, strcmpVector, method, useBytes)
strdistMatrix <- strdistMatrix * as.numeric(weight)
return(strdistMatrix)
}
multiColStrcmp <- function (segmentName, df.records, outputFile, cols, colWeights, columns.output , method, useBytes, cluster_height) {
strdistMatrixList <- lapply(cols, function(x) return(strdistmatrixSelf(as.vector(unlist(df.records[[x]])), method=method, useBytes=useBytes, weight=colWeights[match(x, cols)])))
strDistMatrix <- as.matrix(Reduce("+", strdistMatrixList)) / as.numeric((rep(1,length(colWeights)) %*% colWeights))
rownames(strDistMatrix) <- df.records[[1]]
strDist <- as.dist(strDistMatrix)
rm(strdistMatrixList)
rm(strDistMatrix)
clusterStrDist(strDist, df.records, segmentName, outputFile=outputFile, columns.output=columns.output, cluster_height = cluster_height)
rm(strDist)
#gc()
}
startDedub <- function (inputFile, outputFile, columns.match, columns.strcmp, columns.output, column.filter = " ", column.filterValue = " ", columns.strcmp.weights = c(), strcmpfun="jw", cluster_method="complete", cluster_height=0.1) {
# run script for specified file
# U+00A7 means " as quote character
df.input <- read.csv(file=inputFile, head=TRUE, sep=";")
# some type checking
if (typeof(strcmpfun) != "character")
stop(sprintf(paste("Illegal strcmpfun type: %s", typeof(strcmpfun))))
if (!is.data.frame(df.input))
stop(sprintf(paste("Illegal data frame: %s", typeof(df.input))))
if (!length(columns.strcmp > 0))
stop(sprintf("Provide at least one column for fuzzy string matching"))
if (length(columns.strcmp.weights) == 0)
columns.strcmp.weights <- rep(1, length(columns.strcmp))
if (length(columns.strcmp.weights) != length(columns.strcmp))
stop(sprintf("Please provide exactly one weight for each column to for string comparison"))
lapply(columns.strcmp, function (x) df.input[x] <- trim(tolower(df.input[[x]])))
lapply(columns.strcmp, function (x) df.input[x] <- gsub("*", "", df.input[[x]]))
if (!(column.filter == " ")) {
df.input <- df.input[df.input[column.filter]==column.filterValue, ]
}
gc()
if (length(columns.match) > 0) {
#matchingFramesList <- split(df.input, df.input[,columns.match])
df.input <- within(df.input, splitKey <- do.call("paste", c(df.input[columns.match], sep = ".")))
names(df.input)[length(names(df.input))] <- "splitKey"
#setkey(df.input, "splitKey")
idx <- split(seq_len(nrow(df.input)), df.input$splitKey)
# Check for segments without entities
filterList <- lapply(idx, function (x) length(x) > 1)
idx <- idx[unlist(filterList)]
print(length(idx))
rm(filterList)
gc()
matchingFramesList <- lapply(idx, function(x) { return(as.data.frame(df.input[x,])) } )
# Calculate the number of cores
#no_cores <- detectCores() - 1
# Initiate cluster
#cl <- makeCluster(no_cores)
#combine matching columns in unique key
#clusterExport(cl, varlist=c("idx", "df.input"), envir=environment())
#matchingFramesList <- parLapply(cl, idx, function(x) { return(as.data.frame(df.input[x,])) } )
#stopCluster(cl)
#segmentNames <- unique(df.input[columns.match])
}
else {
matchingFramesList <- list(ALL=df.input)
}
segmentNames <- names(matchingFramesList)
lapply(segmentNames, function (x) multiColStrcmp(x, matchingFramesList[[x]], outputFile = outputFile, cols = columns.strcmp, colWeights = columns.strcmp.weights, columns.output = columns.output, method = strcmpfun, useBytes = TRUE, cluster_height = cluster_height))
gc()
return(TRUE)