-
Fabian Stenzel authoredFabian Stenzel authored
roc.R 11.44 KiB
calc_roc_data <- function(external_binary, #binary transgression: 0 - no,1 - yes
internal_continuous,
external_name,
range_internal, # range of internal data, e.g. c(0,1)
sampling_res = 0.01, # together with range used to create potential thresholds
cellArea = cellarea) {
# check dimensions
internal_continuous <- drop(internal_continuous)
external_binary <- drop(external_binary)
if (length(internal_continuous) != length(external_binary) ||
length(internal_continuous) != length(cellArea)) {
stop("Dimensions of external_binary (", length(external_binary),
"), internal_continuous (", length(internal_continuous),
"), cellArea (", length(cellArea),
") need to match. Aborting.")
}
# controls sampling distance
potential_thresholds <- seq(range_internal[1],range_internal[2],sampling_res)
values <- array(NA, dim=c(length(potential_thresholds),1,3))
dimnames(values) <- list(threshold = potential_thresholds,
indicator = external_name, value = c("TP","FP","slope"))
for (i in seq_along(potential_thresholds)) {
t <- potential_thresholds[i]
internal_cells <- which(internal_continuous > t)
TP <- sum(cellArea[which(external_binary == 1 & internal_continuous > t)]) # True Positive
FP <- sum(cellArea[which(!(external_binary == 1) & internal_continuous > t)]) # False Positive
values[i,1,1] <- TP
values[i,1,2] <- FP
}
#browser()
#plotGlobalManToScreen(data = external_binary, title = "change",
# brks = seq(0,1,0.05),legYes = T,palette = palette_viridis,
# legendtitle = "")
values[,1,1] <- values[,1,1]/max(values[,1,1])
values[,1,2] <- values[,1,2]/max(values[,1,2])
# calculate tangent angle between each pair of values
n <- length(potential_thresholds)
xs <- values[c(1,1:n,n),1,1]
ys <- values[c(1,1:n,n),1,2]
n_e <- length(xs)
slope <- (ys[1:(n_e-2)] - ys[3:n_e]) / (xs[1:(n_e-2)] - xs[3:n_e])
values[,1,3] <- slope
return(values)
}
calc_roc_data_3d <- function( external_continuous,
internal_continuous,
range_internal, # range of internal data, e.g. c(0,1)
range_external, # range of external data, e.g. c(0,1)
sampling = 101, # together with range used to create potential thresholds
cellArea = cellarea) {
# check dimensions
internal_continuous <- drop(internal_continuous)
external_continuous <- drop(external_continuous)
if (length(internal_continuous) != length(external_continuous) ||
length(internal_continuous) != length(cellArea)) {
stop("Dimensions of external_binary (", length(external_continuous),
"), internal_continuous (", length(internal_continuous),
"), cellArea (", length(cellArea),
") need to match. Aborting.")
}
# controls sampling distance
potential_thresholds_internal <- seq(range_internal[1],range_internal[2], length.out = sampling)
potential_thresholds_external <- seq(range_external[1],range_external[2], length.out = sampling)
values <- array(NA, dim=c(length(potential_thresholds_internal),length(potential_thresholds_external),3))
dimnames(values) <- list(threshold_internal = potential_thresholds_internal,
threshold_external = potential_thresholds_external,
value = c("TP","FP","diff"))
for (e in seq_along(potential_thresholds_internal)) {
for (i in seq_along(potential_thresholds_internal)) {
ti <- potential_thresholds_internal[i]
te <- potential_thresholds_external[e]
TP <- sum(cellArea[which(external_continuous > te & internal_continuous > ti)]) # True Positive
FP <- sum(cellArea[which(external_continuous <= te & internal_continuous > ti)]) # False Positive
values[i,e,1] <- TP
values[i,e,2] <- FP
}
}
#values[,,1] <- values[,,1]/max(values[,,1])
#values[,,2] <- values[,,2]/max(values[,,2])
values[,,3] <- values[,,1] - values[,,2]
return(values)
}
roc_plot <- function(filename = NULL,
values # only one metric
) {
if (!is.null(filename)) {
png(filename, res = 300, width = 6, height = 7, units = "in",
pointsize = 6, type = "cairo")
}
par(oma = c(0,0,0,0), mar = c(6,10,5,0), bty = "o", xpd = F,xaxs="i",yaxs="i")
indizes <- dimnames(values)$indicator
auc <- array(NA,dim = c(length(indizes)))
dimnames(auc) <- list(indicator = indizes)
best_fit <- array(NA,dim=c(length(indizes),2))
dimnames(best_fit) <- list(indicator = indizes, type = c("max_diff","slope"))
# Set plot layout
#layout(mat = matrix(c(1,2,3), nrow = 1, ncol = 3, byrow = FALSE),
# heights = c(2,2), # Heights of the rows
# widths = c(1,0.4,0.4)) # Widths of the columns
#layout.show(3)
cex_perc = 2.5
colz <- RColorBrewer::brewer.pal(12,"Set3")[-c(2)]
plot(NA, type = "n", xlab = "FP", ylab = "TP", main = "ROC curve", cex.main = cex_perc,
xlim = c(0,1), ylim = c(0,1), cex.lab = cex_perc, cex.axis = cex_perc, mgp = c(3, 2, 0))
abline(a = 0, b = 1, lty = "dashed", col = "black")
for (ind in indizes) {
indx <- match(ind,indizes)
lines(x = values[,ind,2], y = values[,ind,1], col = colz[indx], lwd = cex_perc)
values[is.na(values[,ind,3]),ind,3] <- 0
best_max_diff <- sort(abs(values[,ind,2] - values[,ind,1]),decreasing = T)[1:5]
best_slope <- sort(abs(values[,ind,3] - 1 ),decreasing = F)[1:5]
closest <- function(x,y){abs((x - y)/sqrt(2))}
best_max_diff_thr <- names(sort(closest(x = values[names(best_max_diff),ind,2],
y = values[names(best_max_diff),ind,1]), decreasing = T)[1])
best_slope_thr <- names(sort(closest(x = values[names(best_slope),ind,2],
y = values[names(best_slope),ind,1]), decreasing = T)[1])
best_fit[ind,] <- c(best_max_diff_thr, best_slope_thr)
points(x = values[best_max_diff_thr,ind,2], y = values[best_max_diff_thr,ind,1],
col = colz[indx], cex = 4, lwd = cex_perc)
# AUC
n <- length(values[,ind,2])
auc[indx] <- sum(abs(values[2:n,ind,2] - values[1:(n - 1),ind,2]) *
values[1:(n - 1),ind,1])
}
legend("bottomright",legend = paste(indizes,round(auc,2)), col = colz,
cex = cex_perc, lwd = cex_perc)
if (!is.null(filename)) dev.off()
return(list(
auc = auc,
optimal_value = best_fit
))
}
roc_plot_paper <- function(filename = NULL,
values # both metric
) {
if (!is.null(filename)) {
png(filename, res = 300, width = 6, height = 7, units = "in",
pointsize = 6, type = "cairo")
}
par(oma = c(0,0,0,0), mar = c(6,10,5,0), bty = "o", xpd = F,xaxs="i",yaxs="i")
vars <- dimnames(values)$metric
indicators <- dimnames(values)$indicator
indizes <- seq_along(indicators)
best_fit <- array(NA,dim=c(length(indicators),2,2))
dimnames(best_fit) <- list(indicator = indicators,
metric = vars, type = c("max_diff","slope"))
auc <- array(NA,dim = c(length(indicators),length(vars)))
dimnames(auc) <- list(indicator = indicators, metric = vars)
# Set plot layout
layout(mat = matrix(c(1,2,3,4,5,6), nrow = 2, ncol = 3, byrow = FALSE),
heights = c(2,2), # Heights of the rows
widths = c(1,0.4,0.4)) # Widths of the columns
layout.show(6)
cex_perc = 2.5
colz <- RColorBrewer::brewer.pal(12,"Set3")[-c(2)]
for (var in vars){
plot(NA, type = "n", xlab = "FP", ylab = "TP", main = "ROC curve", cex.main = cex_perc,
xlim = c(0,1), ylim = c(0,1), cex.lab = cex_perc, cex.axis = cex_perc, mgp = c(3, 2, 0))
if (var == "BioCol") {
mtext(side = 3, at = -0.2, "A", xpd = NA, cex = 1.5, font = 2)
mtext(side = 2, at = 0.5, line = 6, var, xpd = NA, cex = 2, font = 2, srt = 90)
}else{ #var = "EcoRisk"
mtext(side = 3, at = -0.2, "B", xpd = NA, cex = 1.5, font = 2)
mtext(side = 2, at = 0.5, line = 6, var, xpd = NA, cex = 2, font = 2, srt = 90)
}
abline(a = 0, b = 1, lty = "dashed", col = "black")
for (ind in indizes) {
lines(x = values[,var,ind,2], y = values[,var,ind,1], col = colz[ind], lwd = cex_perc)
#points(x = values[,ind,2], y = values[,ind,1])
values[is.na(values[,var,ind,3]),var,ind,3] <- 0
best_max_diff <- sort(abs(values[,var,ind,2] - values[,var,ind,1]),decreasing = T)[1:5]
best_slope <- sort(abs(values[,var,ind,3] - 1 ),decreasing = F)[1:5]
closest <- function(x,y){abs((x - y)/sqrt(2))}
best_max_diff_thr <- names(sort(closest(x = values[names(best_max_diff),var,ind,2],
y = values[names(best_max_diff),var,ind,1]), decreasing = T)[1])
best_slope_thr <- names(sort(closest(x = values[names(best_slope),var,ind,2],
y = values[names(best_slope),var,ind,1]), decreasing = T)[1])
best_fit[ind,match(var,vars),] <- c(best_max_diff_thr, best_slope_thr)
points(x = values[best_max_diff_thr,var,ind,2], y = values[best_max_diff_thr,var,ind,1],
col = colz[ind], cex = 4, lwd = cex_perc)
# AUC
n <- length(values[,var,ind,2])
auc[ind,var] <- sum(abs(values[2:n,var,ind,2] - values[1:(n - 1),var,ind,2]) *
values[1:(n - 1),var,ind,1])
legend("bottomright",legend = indicators, col = colz[indizes],
cex = cex_perc, lwd = cex_perc)
}
}
# optimal thresholds boxplots
biocol_threshold <- boxplot(as.numeric(best_fit[indizes,"BioCol",1]), ylim = c(0,1), cex.main = cex_perc,
main = "optimal\nthreshold", cex.lab = cex_perc, cex.axis = cex_perc)
points(x = rep(0.7, length(indizes)), y = as.numeric(best_fit[indizes,"BioCol",1]),
col = scales::alpha(colz[indizes],1), lwd = 2)
text(x = 1, y = max(biocol_threshold$stats)+0.2, paste0("median:\n",median(biocol_threshold$stats)), cex = cex_perc)
mtext(side = 3, at = 0, "C", xpd = NA, cex = 1.5, font = 2)
ecorisk_threshold <- boxplot(as.numeric(best_fit[indizes,"EcoRisk",1]),
main = "optimal\nthreshold", cex.main = cex_perc,
ylim = c(0,1), cex.lab = cex_perc, cex.axis = cex_perc)
points(x = rep(0.7, length(indizes)), y = as.numeric(best_fit[indizes,"EcoRisk",1]),
col = scales::alpha(colz[indizes],1), lwd = 2)
text(x = 1, y = max(ecorisk_threshold$stats)+0.2, paste0("median:\n", median(ecorisk_threshold$stats)), cex = cex_perc)
mtext(side = 3, at = 0, "D", xpd = NA, cex = 1.5, font = 2)
# AUC boxplots
biocol_auc <- boxplot(auc[indizes,"BioCol"], ylim = c(0,1), main = "AUC", cex.main = cex_perc,
cex.lab = cex_perc, cex.axis = cex_perc)
points(x = rep(0.7, length(indizes)), y = auc[indizes,"BioCol"],
col = scales::alpha(colz[indizes],1), lwd = 2)
mtext(side = 3, at = 0, "E", xpd = NA, cex = 1.5, font = 2)
ecorisk_auc <- boxplot(auc[indizes,"EcoRisk"], main = "AUC", cex.main = cex_perc,
ylim = c(0,1), cex.lab = cex_perc, cex.axis = cex_perc)
points(x = rep(0.7, length(indizes)), y = auc[indizes,"EcoRisk"],
col = scales::alpha(colz[indizes],1), lwd = 2)
mtext(side = 3, at = 0, "F", xpd = NA, cex = 1.5, font = 2)
if (!is.null(filename)) dev.off()
return(list(
auc = auc,
biocol_threshold = biocol_threshold,
ecorisk_threshold = ecorisk_threshold
))
}