#Sample Data
library(ggplot2) # for graphics
library(MASS) # for maximumum likelihood estimation
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# quick and dirty, a truncated normal distribution to work on the solution set
z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)
## 'data.frame': 1716 obs. of 2 variables:
## $ ID : int 1 2 5 6 9 10 12 18 20 21 ...
## $ myVar: num 0.081 0.5398 0.5371 0.9065 0.0444 ...
summary(z$myVar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000395 0.359470 0.754801 0.878634 1.282169 3.674911
p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p1)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normPars <- fitdistr(z$myVar,"normal")
print(normPars)
## mean sd
## 0.87863393 0.64604672
## (0.01559571) (0.01102784)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 0.879 0.646
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 0.0156 0.011
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 0.000243 0 0 0.000122
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 1716
## $ loglik : num -1685
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 0.8786339
meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$myVar),len=length(z$myVar))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
gammaPars <- fitdistr(z$myVar,"gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
The data chosen was that provided in the example dataset labeled “Effects of plant diversity on resistance of plant biomass”
## 'data.frame': 93 obs. of 16 variables:
## $ ID : chr "1" "2" "3" "4" ...
## $ Moisture.treatment : chr "Ambient" "Ambient" "Ambient" "Ambient" ...
## $ Plant.species.richness : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Soil.moisture..m3.m.3. : num 0.1049 0.1122 0.1098 0.0679 0.105 ...
## $ Soil.organic.matter..g.kg.1.: num 9.39 8.5 12.09 10.97 14.57 ...
## $ Plant.biomass..g. : num 33.2 36.4 39.2 91.3 110 ...
## $ Bacterial.Chao.1 : num 4299 4428 4622 4346 4628 ...
## $ Bacterial.OTU.richness : int 2890 2953 3090 3058 2954 2944 2952 3046 3188 3184 ...
## $ X : chr "" "" "" "" ...
## $ X.1 : chr "" "" "" "" ...
## $ X.2 : chr "" "" "" "" ...
## $ X.3 : chr "" "" "" "" ...
## $ X.4 : chr "" "" "" "" ...
## $ X.5 : chr "" "" "" "" ...
## $ X.6 : chr "" "" "" "" ...
## $ X.7 : chr "" "" "" "" ...
## ID Moisture.treatment Plant.species.richness
## Length:60 Length:60 Min. :1.0
## Class :character Class :character 1st Qu.:1.0
## Mode :character Mode :character Median :2.0
## Mean :3.2
## 3rd Qu.:4.0
## Max. :8.0
## Soil.moisture..m3.m.3. Soil.organic.matter..g.kg.1. Plant.biomass..g.
## Min. :0.02117 Min. : 0.6933 Min. : 7.30
## 1st Qu.:0.03125 1st Qu.: 5.8625 1st Qu.: 37.36
## Median :0.05804 Median :10.3308 Median : 69.89
## Mean :0.06808 Mean : 9.5545 Mean : 68.36
## 3rd Qu.:0.10006 3rd Qu.:12.3793 3rd Qu.: 92.45
## Max. :0.13715 Max. :17.6395 Max. :189.21
## Bacterial.Chao.1 Bacterial.OTU.richness X X.1
## Min. :3706 Min. :2441 Length:60 Length:60
## 1st Qu.:4035 1st Qu.:2762 Class :character Class :character
## Median :4320 Median :2950 Mode :character Mode :character
## Mean :4272 Mean :2904
## 3rd Qu.:4504 3rd Qu.:3058
## Max. :4867 Max. :3198
## X.2 X.3 X.4 X.5
## Length:60 Length:60 Length:60 Length:60
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## X.6 X.7 Plant_Biomass
## Length:60 Length:60 Min. : 7.30
## Class :character Class :character 1st Qu.: 37.36
## Mode :character Mode :character Median : 69.89
## Mean : 68.36
## 3rd Qu.: 92.45
## Max. :189.21
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## mean sd
## 68.356836 37.679639
## ( 4.864420) ( 3.439665)
## List of 5
## $ estimate: Named num [1:2] 68.4 37.7
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 4.86 3.44
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 23.7 0 0 11.8
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 60
## $ loglik : num -303
## - attr(*, "class")= chr "fitdistr"
## mean
## 68.35684
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (`geom_bar()`).
### Results
Based off of the data used, it appears that the best fit for the biomass is that of a normal distribution given that the normal distribution (marked in red) follows the distribution of plant biomasses relatively cleanly.