Chapter 4 Exploratory data analysis
4.1 Histogram
Rn
hist(InRn$Rn,
prob = T,
col = "red",
breaks = 10,
main = "Histogram Indoor Radon",
xlab = expression("Rn " * "[Bq" * m^-3 * "]"))
StatDA::qqplot.das(InRn$Rn,
distribution = "norm",
col = 1, envelope = 0.95,
datax = T,
main = "Q-Q plot (InRn)")
## Box-Cox transformation ----
BCT <- MASS::boxcox(InRn$Rn ~ 1, lambda = seq(-1, 1, 1/100))
title("Box-Cox Transformation")
BCT <- as.data.frame(BCT)
# lambda <- BCT[BCT$y == max(BCT$y), ]$x # -0.17
# InRn$BCT <- (InRn$Rn^lambda-1)/lambda
lambda <- 0
InRn$LogRn <- log(InRn$Rn)## Histogram (logRn) ----
hist(InRn$LogRn,
col = "red",
breaks = 30,
prob = T,
main = "Histogram Indoor Radon",
xlab = expression("LogRn " * "[Bq" * m^-3 * "]"))
StatDA::qqplot.das(InRn$LogRn, distribution = "norm", col = 1, envelope = 0.95,datax=T, main = "Q-Q plot (log InRn)")
4.2 Missing data
Imputation of values below the detection limit
## ROS: “ROBUST” IMPUTATION METHOD ----
DL <- 10
InRn_DL <- InRn
InRn_DL$Rn_Cen <- "FALSE"
InRn_DL[InRn_DL$Rn <= DL,]["Rn_Cen"] <- "TRUE"
InRn_DL$Rn_Cen <- as.logical(InRn_DL$Rn_Cen)
ROS <- NADA::ros(InRn_DL$Rn, InRn_DL$Rn_Cen, forwardT = "log")
ROS <- as.data.frame(ROS)
# Replace Dl by the modeled values
InRn_DL[InRn_DL$Rn_Cen == "TRUE",]["Rn"] <- ROS[ROS$censored == "TRUE",]["modeled"]
InRn_DL$LogRn <- log(InRn_DL$Rn)## q-q plots ----
par(mfrow=c(1,2))
StatDA::qqplot.das(InRn$LogRn,
distribution = "norm",
col = 1,
envelope = 0.95,
datax = T,
main = "Original data")
StatDA::qqplot.das(InRn_DL$LogRn,
distribution = "norm",
col = 1,
envelope = 0.95,
datax = T,
main = "After ROS")
mean(InRn$Rn)# [1] 64.1
sd(InRn$Rn) # [1] 126
exp(mean(InRn$LogRn))# [1] 25.7
exp(sd(InRn$LogRn))# [1] 3.68
RL <- 200 # Bq m-3
100*(1 - pnorm(log(RL), mean = mean(InRn$LogRn), sd = sd(InRn$LogRn)))# [1] 5.8
mean(InRn_DL$Rn)# [1] 64.1
sd(InRn_DL$Rn) # [1] 126
exp(mean(InRn_DL$LogRn))# [1] 24.8
exp(sd(InRn_DL$LogRn))# [1] 4.04
100*(1-pnorm(log(RL), mean = mean(InRn_DL$LogRn), sd = sd(InRn_DL$LogRn)))# [1] 6.74
## Histogram (logRn) ----
par(mfrow=c(1,2))
hist(InRn$LogRn,
col = "red",
breaks = 30,
prob = T,
ylim = c(0, 0.5),
main = "Origical data",
xlab = expression("LogRn " * "[Bq" * m^-3 * "]"))
hist(InRn_DL$LogRn,
col = "red",
breaks = 30,
prob = T,
ylim = c(0, 0.5),
main = "After ROS",
xlab = expression("LogRn " * "[Bq" * m^-3 * "]"))
## Histogram, boxplot, q-q plot ----
par(mfrow = c(1,3))
hist(InRn_DL$LogRn,
col = "red",
breaks = 30,
prob = T,
main = "Histogram",
xlab = expression("LogRn " * "[Bq" * m^-3 * "]"))
lines(density(InRn_DL$LogRn), lwd = 1)
boxplot(InRn_DL$LogRn,
notch = TRUE,
col=2,
varwidth = TRUE,
main = "Boxplot",
ylab = "Lognormal transformation",
xlab = expression("LogRn " * "[Bq" * m^-3 * "]"))
StatDA::qqplot.das(InRn_DL$LogRn,
distribution = "norm",
col = 1,
envelope = 0.95,
datax = T,
ylab = "Observed Value",
xlab = "Expected Normal Value",
main = ("Normal Q-Q plot"),
line = "quartiles",
pch = 3,
cex = 0.7,
xaxt = "s")
4.3 Spatial distribution
## Plot InRn measurements in Bq/m3 (with ggplot2) ----
P_Rn <- ggplot() +
geom_sf(data = Grids_10km) +
geom_sf(data = InRn_DL, aes(color = Rn)) +
scale_color_gradient(name = "Bq/m3", low = "blue", high = "red") +
ggtitle("Indoor radon measurements (Simulated)")
P_Rn
## Change intervale in the Rn scale ----
breaks <- c(0, 50, 100, 200, 300, 500, max(InRn_DL$Rn))
InRn_DL <- InRn_DL %>% mutate(brks = cut(Rn, breaks, include.lowest = T, right = F))
cols <- colorRampPalette(c("blue", "red"))(6)
# cols <- terrain.colors(6)
# cols <- heat.colors(6, alpha = 1)
# cols <- colorRampPalette(c("yellow", "red"))(6)
P_Rn_brks <- ggplot() +
geom_sf(data = Grids_10km) +
geom_sf(data = InRn_DL, aes(fill = brks, color = brks)) +
scale_fill_manual(name = "Bq/m3", values = cols, guide = guide_legend(reverse = TRUE)) +
scale_color_manual(name = "Bq/m3", values = cols, guide = guide_legend(reverse = TRUE)) +
ggtitle("Indoor radon measurements (Simulated)")
P_Rn_brks
## Plot if Rn is higher than Reference level (1) or not (0) ----
# Transform InRn to: 1 if Rn >= RL or 0 if Rn < RL ("Case")
RL <- 200 # Bq m-3
InRn_DL <- InRn_DL %>% mutate(Case = as.factor(ifelse(Rn >= 200, yes = 1, no = 0)))
P_Cases <- ggplot() +
geom_sf(data = Grids_10km) +
geom_sf(data = InRn_DL, aes(fill = Case, color = Case)) +
scale_fill_manual( name = "Bq/m3", labels = c("< 200",">= 200"), values = c("lightgreen", "red")) +
scale_color_manual(name = "Bq/m3", labels = c("< 200",">= 200"), values = c("lightgreen", "red")) +
# theme(legend.position = "none") +
ggtitle("Indoor radon measurements (Simulated)")
P_Cases
## Kernel density plots ----
# The resulting density map is “noisier” for small bandwidth (h)
# and “smoother” for large bandwidth (h).
# A rule-of-thumb for an optimal value is h ≈ max(sx, sy)*0.7*n^-0.2
# where n is the number of points,
# and sx and sy the standard deviations of x- and y- coordinates of the points
# See printed version of the EU Atlas for further information (in progress)
# 2.4. Statistics, measurements, maping (part wirtten by P. Bossew)
# All dwelling sampled (e.g. for detecting possible clusters; avoid overplotting)
H <- st_coordinates(InRn_DL)
h <- max(sd(H[,"X"]), sd(H[,"Y"])) * 0.7 * nrow(H)^-0.2
KP_all <- InRn_DL %>%
st_coordinates() %>%
as_tibble() %>%
ggplot() +
geom_sf(data = Grids_10km) +
stat_density_2d(aes(X, Y, fill = ..level.., alpha = ..level..),
h = h,
geom = "polygon") +
scale_fill_distiller(palette = "Spectral") +
theme(legend.position = "none") +
#geom_sf(data = InRn_DL, size = .1) +
ggtitle("Kernel density plots (all data)") +
labs(x = "", y = "")
KP_all
# Only dwellings with InRn > RL (cases == 1)
H <- st_coordinates(filter(InRn_DL, Case == 1))
h <- max(sd(H[,"X"]), sd(H[,"Y"])) * 0.7 * nrow(H)^-0.2
KP_Cases <- InRn_DL %>% filter(Case == 1) %>%
st_coordinates() %>%
as_tibble() %>%
ggplot() +
geom_sf(data = Grids_10km) +
stat_density_2d(aes(X, Y, fill = ..level.., alpha = ..level..),
h = h,
geom = "polygon") +
scale_fill_distiller(palette = "Spectral") +
theme(legend.position = "none") +
geom_sf(data = filter(InRn_DL, Case == 1), size = .1) +
ggtitle("Kernel density plots (InRn >= 200 Bq/m3)") +
labs(x = "", y = "")
KP_Cases
# Only dwellings with InRn < RL (cases)
H <- st_coordinates(filter(InRn_DL, Case == 0))
h <- max(sd(H[,"X"]), sd(H[,"Y"])) * 0.7 * nrow(H)^-0.2
KP_No_Cases <- InRn_DL %>% filter(Case == 0) %>%
st_coordinates() %>%
as_tibble() %>%
ggplot() +
geom_sf(data = Grids_10km) +
stat_density_2d(aes(X, Y, fill = ..level.., alpha = ..level..),
h = h,
geom = "polygon") +
scale_fill_distiller(palette = "Spectral") +
theme(legend.position = "none") +
geom_sf(data = filter(InRn_DL, Case == 0), size = .1) +
ggtitle("Kernel density plots (InRn < 200 Bq/m3)") +
labs(x = "", y = "")
KP_No_Cases
# Plot two (or more) figures in one
library(gridExtra)
grid.arrange(KP_No_Cases, KP_Cases, nrow = 2)
grid.arrange(KP_No_Cases, KP_Cases, nrow = 1)
# InRn vs Geologia ----
P_BG <- ggplot() +
geom_sf(data = Country) +
geom_sf(data = IGME5000, aes(fill = AgeName), colour = NA) +
geom_sf(data = InRn_DL, aes(), colour = 1, cex = 0.8) +
scale_color_gradient(low = "blue", high = "red") +
ggtitle("Geology 1:1M")
P_BG
## Intersect ----
InRn_DL_BG <- st_intersection(InRn_DL, IGME5000)## Boxplots ----
par(mar = c(9,5,3,0.5), oma = c(0, 0.5, 0.5, 0.5), mfrow = c(1,1))
boxplot(LogRn ~ AgeName, InRn_DL_BG, col = 2,
varwidth = TRUE,
notch = T,
las = 2,
ylab = expression("LogRn " * "[Bq" * m^-3 * "]"),
xlab = "",
main = "Geology (AgeName)")
# ANOVA ----
lm_BG <- lm(LogRn ~ AgeName, InRn_DL_BG)
summary(lm_BG)#
# Call:
# lm(formula = LogRn ~ AgeName, data = InRn_DL_BG)
#
# Residuals:
# Min 1Q Median 3Q Max
# -3.952 -0.904 -0.029 0.974 4.048
#
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 2.822 0.104 27.19 <2e-16 ***
# AgeNameEarly Triassic 0.255 0.136 1.87 0.0617 .
# AgeNameLate Devonian 0.517 0.133 3.90 0.0001 ***
# AgeNameLate Jurassic 0.859 0.137 6.28 5e-10 ***
# AgeNameLate Permian -0.247 0.184 -1.34 0.1793
# AgeNameMiddle Jurassic 0.767 0.464 1.65 0.0986 .
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#
# Residual standard error: 1.36 on 994 degrees of freedom
# Multiple R-squared: 0.0613, Adjusted R-squared: 0.0566
# F-statistic: 13 on 5 and 994 DF, p-value: 2.89e-12
anova(lm_BG)# Analysis of Variance Table
#
# Response: LogRn
# Df Sum Sq Mean Sq F value Pr(>F)
# AgeName 5 120 23.92 13 2.9e-12 ***
# Residuals 994 1830 1.84
# ---
# Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1