#Sample Data

library(ggplot2) # for graphics
library(MASS) # for maximumum likelihood estimation
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:MASS':
## 
##     select

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# quick and dirty, a truncated normal distribution to work on the solution set

z <- rnorm(n=3000,mean=0.2)
z <- data.frame(1:3000,z)
names(z) <- list("ID","myVar")
z <- z[z$myVar>0,]
str(z)

## 'data.frame':    1716 obs. of  2 variables:
##  $ ID   : int  1 2 5 6 9 10 12 18 20 21 ...
##  $ myVar: num  0.081 0.5398 0.5371 0.9065 0.0444 ...

summary(z$myVar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## 0.000395 0.359470 0.754801 0.878634 1.282169 3.674911

Make a histogram of the data

p1 <- ggplot(data=z, aes(x=myVar, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

print(p1)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add a Density curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get Maximum likely parameters

normPars <- fitdistr(z$myVar,"normal")
print(normPars)

##       mean          sd    
##   0.87863393   0.64604672 
##  (0.01559571) (0.01102784)

str(normPars)

## List of 5
##  $ estimate: Named num [1:2] 0.879 0.646
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 0.0156 0.011
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 0.000243 0 0 0.000122
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 1716
##  $ loglik  : num -1685
##  - attr(*, "class")= chr "fitdistr"

normPars$estimate["mean"] # note structure of getting a named attribute

##      mean 
## 0.8786339

Plot Normal, Exponential, and Uniform Densities

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]

xval <- seq(0,max(z$myVar),len=length(z$myVar))

 stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$myVar), args = list(mean = meanML, sd = sdML))
 p1 + stat

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

expoPars <- fitdistr(z$myVar,"exponential")
rateML <- expoPars$estimate["rate"]

stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$myVar), args = list(rate=rateML))
 p1 + stat + stat2

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$myVar), args = list(min=min(z$myVar), max=max(z$myVar)))
 p1 + stat + stat2 + stat3

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot Beta and Gamma distribution

gammaPars <- fitdistr(z$myVar,"gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]

stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$myVar), args = list(shape=shapeML, rate=rateML))
 p1 + stat + stat2 + stat3 + stat4

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

 pSpecial <- ggplot(data=z, aes(x=myVar/(max(myVar + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$myVar/max(z$myVar + 0.1),start=list(shape1=1,shape2=2),"beta")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]

statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$myVar), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

Own Data

The data chosen was that provided in the example dataset labeled “Effects of plant diversity on resistance of plant biomass”

## 'data.frame':    93 obs. of  16 variables:
##  $ ID                          : chr  "1" "2" "3" "4" ...
##  $ Moisture.treatment          : chr  "Ambient" "Ambient" "Ambient" "Ambient" ...
##  $ Plant.species.richness      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Soil.moisture..m3.m.3.      : num  0.1049 0.1122 0.1098 0.0679 0.105 ...
##  $ Soil.organic.matter..g.kg.1.: num  9.39 8.5 12.09 10.97 14.57 ...
##  $ Plant.biomass..g.           : num  33.2 36.4 39.2 91.3 110 ...
##  $ Bacterial.Chao.1            : num  4299 4428 4622 4346 4628 ...
##  $ Bacterial.OTU.richness      : int  2890 2953 3090 3058 2954 2944 2952 3046 3188 3184 ...
##  $ X                           : chr  "" "" "" "" ...
##  $ X.1                         : chr  "" "" "" "" ...
##  $ X.2                         : chr  "" "" "" "" ...
##  $ X.3                         : chr  "" "" "" "" ...
##  $ X.4                         : chr  "" "" "" "" ...
##  $ X.5                         : chr  "" "" "" "" ...
##  $ X.6                         : chr  "" "" "" "" ...
##  $ X.7                         : chr  "" "" "" "" ...

##       ID            Moisture.treatment Plant.species.richness
##  Length:60          Length:60          Min.   :1.0           
##  Class :character   Class :character   1st Qu.:1.0           
##  Mode  :character   Mode  :character   Median :2.0           
##                                        Mean   :3.2           
##                                        3rd Qu.:4.0           
##                                        Max.   :8.0           
##  Soil.moisture..m3.m.3. Soil.organic.matter..g.kg.1. Plant.biomass..g.
##  Min.   :0.02117        Min.   : 0.6933              Min.   :  7.30   
##  1st Qu.:0.03125        1st Qu.: 5.8625              1st Qu.: 37.36   
##  Median :0.05804        Median :10.3308              Median : 69.89   
##  Mean   :0.06808        Mean   : 9.5545              Mean   : 68.36   
##  3rd Qu.:0.10006        3rd Qu.:12.3793              3rd Qu.: 92.45   
##  Max.   :0.13715        Max.   :17.6395              Max.   :189.21   
##  Bacterial.Chao.1 Bacterial.OTU.richness      X                 X.1           
##  Min.   :3706     Min.   :2441           Length:60          Length:60         
##  1st Qu.:4035     1st Qu.:2762           Class :character   Class :character  
##  Median :4320     Median :2950           Mode  :character   Mode  :character  
##  Mean   :4272     Mean   :2904                                                
##  3rd Qu.:4504     3rd Qu.:3058                                                
##  Max.   :4867     Max.   :3198                                                
##      X.2                X.3                X.4                X.5           
##  Length:60          Length:60          Length:60          Length:60         
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##      X.6                X.7            Plant_Biomass   
##  Length:60          Length:60          Min.   :  7.30  
##  Class :character   Class :character   1st Qu.: 37.36  
##  Mode  :character   Mode  :character   Median : 69.89  
##                                        Mean   : 68.36  
##                                        3rd Qu.: 92.45  
##                                        Max.   :189.21

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

##      mean         sd    
##   68.356836   37.679639 
##  ( 4.864420) ( 3.439665)

## List of 5
##  $ estimate: Named num [1:2] 68.4 37.7
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 4.86 3.44
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 23.7 0 0 11.8
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 60
##  $ loglik  : num -303
##  - attr(*, "class")= chr "fitdistr"

##     mean 
## 68.35684

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

### Results

Based off of the data used, it appears that the best fit for the biomass is that of a normal distribution given that the normal distribution (marked in red) follows the distribution of plant biomasses relatively cleanly.

Homework_06

Ely P. Remes

2024-02-21