library(ggplot2)
library(GGally)
library(memisc)
## Loading required package: lattice
## Loading required package: MASS
##
## Attaching package: 'memisc'
##
## The following objects are masked from 'package:stats':
##
## contr.sum, contr.treatment, contrasts
##
## The following object is masked from 'package:base':
##
## as.array
library(gridExtra)
## Loading required package: grid
theme_set(theme_classic(12))
# The first (unnamed) column of the csv file only numbers the rows,
# so it is useless for the analysis: keep columns 2 to 13 only.
wines <- read.csv("wineQualityWhites.csv")[, 2:13]
dim(wines)
## [1] 4898 12
names(wines)
## [1] "fixed.acidity" "volatile.acidity" "citric.acid"
## [4] "residual.sugar" "chlorides" "free.sulfur.dioxide"
## [7] "total.sulfur.dioxide" "density" "pH"
## [10] "sulphates" "alcohol" "quality"
str(wines)
## 'data.frame': 4898 obs. of 12 variables:
## $ fixed.acidity : num 7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
## $ volatile.acidity : num 0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
## $ citric.acid : num 0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
## $ residual.sugar : num 20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
## $ chlorides : num 0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
## $ free.sulfur.dioxide : num 45 14 30 47 47 30 30 45 14 28 ...
## $ total.sulfur.dioxide: num 170 132 97 186 186 97 136 170 132 129 ...
## $ density : num 1.001 0.994 0.995 0.996 0.996 ...
## $ pH : num 3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
## $ sulphates : num 0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
## $ alcohol : num 8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
## $ quality : int 6 6 6 6 6 6 6 6 6 6 ...
summary(wines)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700
## Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200
## Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391
## 3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900
## Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.00900 Min. : 2.00 Min. : 9.0
## 1st Qu.:0.03600 1st Qu.: 23.00 1st Qu.:108.0
## Median :0.04300 Median : 34.00 Median :134.0
## Mean :0.04577 Mean : 35.31 Mean :138.4
## 3rd Qu.:0.05000 3rd Qu.: 46.00 3rd Qu.:167.0
## Max. :0.34600 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.00
## 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100 1st Qu.: 9.50
## Median :0.9937 Median :3.180 Median :0.4700 Median :10.40
## Mean :0.9940 Mean :3.188 Mean :0.4898 Mean :10.51
## 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500 3rd Qu.:11.40
## Max. :1.0390 Max. :3.820 Max. :1.0800 Max. :14.20
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.878
## 3rd Qu.:6.000
## Max. :9.000
The number of variables, the number of attributes and their names correspond to the description in the wineQualityInfo.txt file.
All attributes are decimals except the output variable (quality) which is of type integer.
Most of the attributes (except density, pH, alcohol and quality) have a Max value close to (or greater than) twice the third quartile. Let us plot them to see more.
Let us explore individual variables first
# Fixed acidity: a coarse histogram first, then a much finer binning
# zoomed on the 3-12 range with one axis tick per unit.
qplot(x = fixed.acidity, data = wines, binwidth = 0.2)
qplot(x = fixed.acidity, data = wines, binwidth = 0.01) +
  coord_cartesian(xlim = c(3, 12)) +
  scale_x_continuous(breaks = seq(from = 0, to = 12, by = 1))
## Warning: position_stack requires constant width: output may be incorrect
# Count how often each fixed acidity value occurs, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort(table(wines$fixed.acidity), decreasing = TRUE)
##
## 6.8 6.6 6.4 6.9 6.7 7 6.5 7.2 7.1 7.4 6.2 6.3 6 7.3 6.1
## 308 290 280 241 236 232 225 206 200 194 192 188 184 178 155
## 7.6 7.5 5.8 5.9 7.7 7.8 5.7 8 7.9 5.6 8.1 8.2 8.3 8.4 8.5
## 153 123 121 103 93 93 88 80 74 71 56 56 52 35 32
## 5.5 5.2 5.4 5.3 8.6 5 5.1 9.2 8.8 9 8.9 8.7 9.4 4.8 9.8
## 31 28 28 27 25 24 23 21 18 17 16 15 11 9 8
## 4.9 9.1 4.7 9.6 9.7 4.4 9.3 10 4.2 6.15 7.15 9.5 9.9 10.3 10.7
## 7 6 5 5 4 3 3 3 2 2 2 2 2 2 2
## 3.8 3.9 4.5 4.6 6.45 10.2 11.8 14.2
## 1 1 1 1 1 1 1 1
Fixed acidity seems to be normally distributed over wines, with most values between 6 and 8. Most of the values have a single decimal digit.
# Volatile acidity: histogram zoomed on [0, 1], then its numeric summary.
qplot(x = volatile.acidity, data = wines, binwidth = 0.02) +
  coord_cartesian(xlim = c(0, 1)) +
  scale_x_continuous(breaks = seq(from = 0, to = 1, by = 0.1))
summary(wines$volatile.acidity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0800 0.2100 0.2600 0.2782 0.3200 1.1000
Most wines have volatile acidity between 0.2 and 0.35
# Citric acid, drawn both as a histogram and as a frequency polygon
# (same 0.05 bins, zoomed on [0, 1]), then its numeric summary.
qplot(x = citric.acid, data = wines, binwidth = 0.05) +
  coord_cartesian(xlim = c(0, 1)) +
  scale_x_continuous(breaks = seq(from = 0, to = 1, by = 0.1))
qplot(x = citric.acid, data = wines, binwidth = 0.05, geom = "freqpoly") +
  coord_cartesian(xlim = c(0, 1)) +
  scale_x_continuous(breaks = seq(from = 0, to = 1, by = 0.1))
summary(wines$citric.acid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2700 0.3200 0.3342 0.3900 1.6600
Most values of citric acid are between 0.2 and 0.5, with a Median of 0.32. We have wines with no trace of citric acid (Min. :0.0000).
# Residual sugar: unit-wide bins, zoomed on [0, 25], then its numeric summary.
qplot(x = residual.sugar, data = wines, binwidth = 1) +
  coord_cartesian(xlim = c(0, 25))
summary(wines$residual.sugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.600 1.700 5.200 6.391 9.900 65.800
There are few wines with a residual sugar value below 1. We then have a peak of values between 1 and 2, and then the frequency decreases as values grow up to 21. There are some outliers greater than 21.
# Chlorides: histogram zoomed on [0, 0.1], then its numeric summary.
qplot(x = chlorides, data = wines, binwidth = 0.002) +
  coord_cartesian(xlim = c(0, 0.1)) +
  scale_x_continuous(breaks = seq(from = 0, to = 0.1, by = 0.01))
summary(wines$chlorides)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00900 0.03600 0.04300 0.04577 0.05000 0.34600
Chlorides values look normally distributed, with most values between 0.03 and 0.06 and a median of 0.043.
# Free sulfur dioxide: unit-wide bins, zoomed on [0, 100], one tick every 10.
qplot(x = free.sulfur.dioxide, data = wines, binwidth = 1) +
  coord_cartesian(xlim = c(0, 100)) +
  scale_x_continuous(breaks = seq(from = 0, to = 100, by = 10))
# Count how often each free sulfur dioxide value occurs, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort(table(wines$free.sulfur.dioxide), decreasing = TRUE)
##
## 29 31 26 35 34 36 24 28 33 25 37 23
## 160 132 129 129 128 127 118 112 112 111 111 110
## 32 41 40 22 38 20 45 27 30 21 47 17
## 109 104 103 102 102 101 101 99 99 93 91 89
## 39 42 19 49 18 15 44 52 14 53 48 46
## 89 86 84 82 80 79 75 72 68 68 66 64
## 50 43 54 16 55 10 13 51 12 61 11 57
## 64 63 61 58 58 55 55 54 51 47 45 44
## 56 59 60 58 8 6 63 9 62 5 7 68
## 42 39 38 37 35 32 30 29 29 25 25 24
## 64 67 66 69 65 4 70 3 73 48.5 75 81
## 23 22 17 17 14 11 11 10 8 7 7 7
## 72 71 74 76 77 44.5 52.5 73.5 78 79.5 83 87
## 6 5 5 5 5 4 4 4 4 4 4 4
## 96 98 108 35.5 41.5 50.5 59.5 60.5 79 82 85 86
## 3 3 3 2 2 2 2 2 2 2 2 2
## 101 105 2 11.5 15.5 19.5 23.5 28.5 30.5 38.5 39.5 40.5
## 2 2 1 1 1 1 1 1 1 1 1 1
## 42.5 43.5 51.5 61.5 64.5 70.5 77.5 80 82.5 88 89 93
## 1 1 1 1 1 1 1 1 1 1 1 1
## 95 97 110 112 118.5 122.5 124 128 131 138.5 146.5 289
## 1 1 1 1 1 1 1 1 1 1 1 1
summary(wines$free.sulfur.dioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 23.00 34.00 35.31 46.00 289.00
The distribution of free sulfur dioxide also looks normal, with most values between 20 and 50. Also, most values are integers; each non-integer value occurs at most 7 times.
# Density: narrow bins, zoomed on [0.98, 1.01].
qplot(x = density, data = wines, binwidth = 0.0005) +
  coord_cartesian(xlim = c(0.98, 1.01)) +
  scale_x_continuous(breaks = seq(from = 0.98, to = 1.01, by = 0.005))
## Warning: position_stack requires constant width: output may be incorrect
summary(wines$density)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.9871 0.9917 0.9937 0.9940 0.9961 1.0390
Density values are very compact and mostly range between 0.99 and 1.
# pH: histogram zoomed on [2.5, 4], then its numeric summary.
qplot(x = pH, data = wines, binwidth = 0.05) +
  coord_cartesian(xlim = c(2.5, 4)) +
  scale_x_continuous(breaks = seq(from = 2.5, to = 4, by = 0.1))
summary(wines$pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.720 3.090 3.180 3.188 3.280 3.820
Most wines have pH between 3 and 3.4, with the median at 3.18
# Sulphates: frequency polygon over the full range, then its numeric summary.
qplot(x = sulphates, data = wines, geom = "freqpoly", binwidth = 0.02) +
  scale_x_continuous(breaks = seq(from = 0.2, to = 1.1, by = 0.1))
summary(wines$sulphates)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.2200 0.4100 0.4700 0.4898 0.5500 1.0800
Most wines have sulphates between 0.35 and 0.6. The median is 0.47 and the mean 0.49
# Alcohol histogram, stored in alcohol.plot because it is reused in the
# final plots section; displayed here together with the numeric summary.
alcohol.plot <- qplot(
  x = alcohol, data = wines, binwidth = 1,
  fill = I("blue"),
  main = "Distribution of alcohol in white wines"
) +
  coord_cartesian(xlim = c(8, 14.2))
alcohol.plot
summary(wines$alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 8.00 9.50 10.40 10.51 11.40 14.20
We have few wines with an alcohol rate between 8 and 9. The peak of frequency is between 9 and 10. The frequency then decreases as alcohol grows up to 14. Most values are between 9 and 12.
qplot(quality, data=wines, geom='bar')
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Count how many wines received each quality grade, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort(table(wines$quality), decreasing = TRUE)
##
## 6 5 7 8 4 3 9
## 2198 1457 880 175 163 20 5
summary(wines$quality)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 5.000 6.000 5.878 6.000 9.000
Wine quality ranges from 3 to 9. Most wines are rated 5, 6 or 7; the median is 6.
Most of the variables (except residual sugar, alcohol and quality) seem to be normally distributed! What could be the relationships between them?
First of all, let us get an overview of how any pair of variables can be related
# Pairwise overview on a random sample of 1000 wines; the seed is fixed
# so that the same sample is drawn on every run.
# Column labels are two-letter abbreviations of the 12 column names.
set.seed(2500)
ggpairs(
  wines[sample.int(nrow(wines), 1000), ],
  axisLabels = "none",
  upper = list(params = list(size = 3)),
  lower = list(params = list(size = I(0.5), alpha = I(0.2))),
  columnLabels = c(
    "FA", "VA", "CA", "RS", "CH", "FS", "TS",
    "DE", "PH", "SU", "AL", "QU"
  )
)
I will first go deeper into the plots that look linear (residual sugar vs density, alcohol vs density). I am also interested in looking deeper into some other relationships, like fixed acidity vs residual sugar and citric acid vs residual sugar. I will end by exploring the relationships between quality and every other variable.
# Density against residual sugar with a linear fit, after dropping the
# values at or above the 99th percentile of each variable; the correlation
# test below is run on the full data set.
qplot(
  x = residual.sugar, y = density,
  data = subset(
    wines,
    residual.sugar < quantile(residual.sugar, 0.99) &
      density < quantile(density, 0.99)
  ),
  alpha = I(0.2)
) +
  geom_smooth(method = "lm")
cor.test(wines$residual.sugar, wines$density)
##
## Pearson's product-moment correlation
##
## data: wines$residual.sugar and wines$density
## t = 107.8749, df = 4896, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8304732 0.8470698
## sample estimates:
## cor
## 0.8389665
Plotting them together and computing the correlation shows that density and residual sugar are highly correlated; they should not be used together if we build a model.
# Density against alcohol with a linear fit, after dropping the values at or
# above the 99th percentile of each variable; the correlation test below is
# run on the full data set.
qplot(
  x = alcohol, y = density,
  data = subset(
    wines,
    alcohol < quantile(alcohol, 0.99) & density < quantile(density, 0.99)
  ),
  alpha = I(0.2)
) +
  geom_smooth(method = "lm")
cor.test(wines$alcohol, wines$density)
##
## Pearson's product-moment correlation
##
## data: wines$alcohol and wines$density
## t = -87.2549, df = 4896, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7908646 -0.7689315
## sample estimates:
## cor
## -0.7801376
Again we can see a strong correlation between the two variables, even if our linear model is decreasing in this case. What about alcohol and residual sugar?
# Residual sugar against alcohol with a linear fit, after dropping the values
# at or above the 99th percentile of each variable; the correlation test
# below is run on the full data set.
qplot(
  x = alcohol, y = residual.sugar,
  data = subset(
    wines,
    alcohol < quantile(alcohol, 0.99) &
      residual.sugar < quantile(residual.sugar, 0.99)
  ),
  alpha = I(0.2)
) +
  geom_smooth(method = "lm")
cor.test(wines$alcohol, wines$residual.sugar)
##
## Pearson's product-moment correlation
##
## data: wines$alcohol and wines$residual.sugar
## t = -35.3209, df = 4896, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4726723 -0.4280267
## sample estimates:
## cor
## -0.4506312
Difficult to see any linear relationship between the two variables, although both are highly correlated with density.
# Fixed acidity against residual sugar with a linear fit, after dropping the
# values at or above the 99th percentile of each variable; the correlation
# test below is run on the full data set.
qplot(
  x = residual.sugar, y = fixed.acidity,
  data = subset(
    wines,
    residual.sugar < quantile(residual.sugar, 0.99) &
      fixed.acidity < quantile(fixed.acidity, 0.99)
  ),
  alpha = I(0.2)
) +
  geom_smooth(method = "lm")
cor.test(wines$residual.sugar, wines$fixed.acidity)
##
## Pearson's product-moment correlation
##
## data: wines$residual.sugar and wines$fixed.acidity
## t = 6.2537, df = 4896, p-value = 4.348e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.06116674 0.11673612
## sample estimates:
## cor
## 0.0890207
The average fixed acidity value is almost the same for different residual sugar values.
# Residual sugar against citric acid, after dropping the values at or above
# the 99th percentile of each variable.
qplot(
  x = citric.acid, y = residual.sugar,
  data = subset(
    wines,
    residual.sugar < quantile(residual.sugar, 0.99) &
      citric.acid < quantile(citric.acid, 0.99)
  ),
  alpha = I(0.2)
)
Again, difficult to see any relationship between the two variables
Let us see how any of the attributes relates to wine quality
# Quality against fixed acidity with a smoothed conditional mean on top.
fixed.acidity.plot <- qplot(
  x = fixed.acidity, y = quality, data = wines, alpha = I(0.2)
) +
  geom_smooth()
fixed.acidity.plot
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
It might be difficult to see this just by looking at the scatter plot, but adding the smoothed conditional mean shows us that average quality tends to decrease when fixed acidity increases.
Let us do a similar plots for quality and others attributes
# Build one "quality vs attribute" plot per input attribute and collect them
# in `plots`.
# nameList holds the column names of wines and labelList the axis labels;
# the two are paired by position, so labelList must follow the column order
# of wines (quality, the last column, has no label and is not plotted).
nameList <- names(wines)
labelList <- c(
  "Fixed acidity", "Volatile acidity", "Citric acid",
  "Residual sugar", "Chlorides", "Free sulfur dioxide",
  "Total sulfur dioxide", "Density",
  "pH", "Sulphates", "Alcohol"
)
len <- length(labelList)
# Preallocate the list instead of growing it, and iterate with seq_len() so
# that an empty labelList gives zero iterations (1:0 would run twice).
plots <- vector("list", len)
for (i in seq_len(len)) {
  name <- nameList[i]
  # exclude outliers from plots: keep values up to the 99th percentile
  quant <- quantile(wines[[name]], 0.99)
  plots[[i]] <- ggplot(
    data = wines[wines[[name]] <= quant, ],
    mapping = aes_string(x = name, y = "quality")
  ) +
    geom_point(alpha = I(0.1), size = I(0.2), color = I("orange")) +
    geom_smooth() +
    ylab("Quality") +
    xlab(labelList[i])
}
# Lay all the plots out on a 4-column grid under a common title
# (do.call pattern found on Stack Overflow).
quality.others.plot <- do.call(
  arrangeGrob,
  c(plots, ncol = 4, main = "Quality vs other attributes")
)
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
quality.others.plot
Wine quality seems to increase with alcohol and pH, and to decrease with fixed acidity, volatile acidity and density. It is more difficult to tell for the other attributes. Let us see how it may relate to two or more of these variables.
# Jittered quality vs alcohol, coloured by fixed acidity on a 100-step
# blue-to-red gradient; values at or above the 99th percentile of alcohol
# and of fixed acidity are dropped.
qplot(
  x = alcohol, y = quality, color = fixed.acidity,
  data = subset(
    wines,
    alcohol < quantile(alcohol, 0.99) &
      fixed.acidity < quantile(fixed.acidity, 0.99)
  ),
  geom = "jitter"
) +
  scale_color_gradientn(
    colours = colorRampPalette(c("blue", "red"))(100)
  )
Difficult to take anything from this heat map. Let us separate fixed acidity and volatile acidity into two buckets first and plot again
# Divide a numeric vector into two buckets around its median.
#
# Args:
#   vector: numeric vector to split.
# Returns:
#   A factor with two levels, [floor(min), median] and (median, ceiling(max)].
#   The lowest break is included (include.lowest = TRUE): cut() leaves the
#   first interval open on the left by default, so a value equal to
#   floor(min(vector)) -- i.e. a minimum that is already a whole number --
#   would otherwise be turned into NA.
#   Missing values are ignored when computing the breaks and stay NA in the
#   result.
cutInTwo <- function(vector) {
  lower <- floor(min(vector, na.rm = TRUE))
  middle <- median(vector, na.rm = TRUE)
  upper <- ceiling(max(vector, na.rm = TRUE))
  cut(vector, breaks = c(lower, middle, upper), include.lowest = TRUE)
}
# Two-level (below / above the median) versions of three attributes.
wines$fixed.acidity.bucket <- cutInTwo(wines$fixed.acidity)
# The next two buckets are used later in the analysis.
wines$volatile.acidity.bucket <- cutInTwo(wines$volatile.acidity)
wines$pH.bucket <- cutInTwo(wines$pH)
# Jittered quality vs alcohol, coloured by fixed acidity bucket.
title <- "Quality vs alcohol and fixed acidity"
quality.alcohol.fixed.acidity.plot <- qplot(
  x = alcohol, y = quality,
  color = fixed.acidity.bucket,
  geom = "jitter",
  data = wines,
  xlab = "Alcohol", ylab = "Quality",
  main = title
) +
  scale_color_brewer(
    type = "qual", palette = 2,
    guide = guide_legend(title = "Fixed acidity")
  )
quality.alcohol.fixed.acidity.plot
Now you can see that for lower values of alcohol (alcohol < 10), better quality wines have higher fixed acidity, whereas for wines with higher values of alcohol, better quality wines have lower fixed acidity.
# Same plot, one panel per volatile acidity bucket.
quality.alcohol.fixed.acidity.plot +
  facet_wrap(~ volatile.acidity.bucket)
The pattern observed on the previous plot seems to hold for the different buckets of volatile acidity.
# Base quality vs alcohol scatter with a smoother; it is reused below with
# different facets.
quality.alcohol.plot <- qplot(
  x = alcohol, y = quality, data = wines,
  alpha = I(0.2), color = I("Orange"),
  xlab = "Alcohol", ylab = "Quality"
) +
  geom_smooth()
# One panel per fixed acidity bucket.
quality.alcohol.by.fixed.acidity.plot <- quality.alcohol.plot +
  facet_wrap(~ fixed.acidity.bucket) +
  ggtitle("Quality vs alcohol by fixed acidity")
quality.alcohol.by.fixed.acidity.plot
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
The increase of wine quality with alcohol seems more evident for lower values of fixed acidity than for higher ones
# Jittered quality vs alcohol, coloured by volatile acidity bucket.
quality.alcohol.volatile.acidity.plot <- qplot(
  x = alcohol, y = quality,
  color = volatile.acidity.bucket,
  geom = "jitter", data = wines
) +
  scale_color_brewer(type = "qual", palette = 2)
quality.alcohol.volatile.acidity.plot
Holding alcohol constant, wines with lower volatile acidity seem to have higher quality. This is more visible for wines with an alcohol value lower than 12.
# Quality vs alcohol with a smoother, one panel per volatile acidity bucket.
quality.alcohol.by.volatile.acidity.plot <- quality.alcohol.plot +
  facet_wrap(~ volatile.acidity.bucket) +
  ggtitle("Quality vs alcohol by volatile acidity")
quality.alcohol.by.volatile.acidity.plot
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
The increase of wine quality with alcohol seems more evident for higher values of volatile acidity than for lower ones
# Jittered quality vs density on the full data set, coloured by fixed
# acidity bucket.
quality.density.fixed.acidity.plot <- qplot(
  x = density, y = quality,
  color = fixed.acidity.bucket,
  geom = "jitter", data = wines
) +
  scale_color_brewer(type = "qual", palette = 2)
quality.density.fixed.acidity.plot
Let us remove outliers and plot again.
# Keep only the wines whose density is below the 99th percentile, then redraw
# the quality vs density plot (coloured by fixed acidity bucket) on that subset.
wines.density.99 <- subset(wines, density < quantile(density, 0.99))
title <- "Quality vs density and fixed acidity"
quality.density.fixed.acidity.plot <- qplot(
  x = density, y = quality,
  color = fixed.acidity.bucket,
  geom = "jitter",
  data = wines.density.99,
  main = title
) +
  scale_color_brewer(
    type = "qual", palette = 2,
    guide = guide_legend(title = "Fixed acidity")
  )
quality.density.fixed.acidity.plot
For higher values of density, wines of better quality have higher fixed acidity. It is consistent with what we observed for alcohol since it is highly correlated with density but having a negative correlation coefficient.
# Base quality vs density scatter (outliers removed) with a smoother,
# shown here with one panel per fixed acidity bucket.
quality.density.plot <- qplot(
  x = density, y = quality, data = wines.density.99, alpha = I(0.2)
) +
  geom_smooth()
quality.density.plot +
  facet_wrap(~ fixed.acidity.bucket)
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
The decrease of wine quality with density is visible for both higher and lower values of fixed acidity.
# Jittered quality vs density (outliers removed), coloured by volatile
# acidity bucket.
quality.density.volatile.acidity.plot <- qplot(
  x = density, y = quality,
  color = volatile.acidity.bucket,
  geom = "jitter", data = wines.density.99
) +
  scale_color_brewer(type = "qual", palette = 2)
quality.density.volatile.acidity.plot
For higher values of density, wines of better quality have lower volatile acidity.
# Quality vs density with a smoother, one panel per volatile acidity bucket.
quality.density.plot +
  facet_wrap(~ volatile.acidity.bucket)
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
For both higher and lower values of volatile acidity, wine quality globally decreases with density
# Final plot 1: the alcohol histogram with explicit axis labels.
alcohol.plot +
  xlab("Alcohol") +
  ylab("Number of wines")
We have few wines with an alcohol rate between 8 and 9. The peak of frequency is between 9 and 10. The frequency then decreases as alcohol grows up to 14. Most values are between 9 and 12. Alcohol is one of the variables (with residual sugar and quality) that does not seem to be normally distributed.
# Final plot 2: the two fixed acidity plots stacked in a single column under
# a common two-line title.
title <- "Comparing the combined effect of alcohol or density with fixed\nacidity on wine quality"
grid.arrange(
  quality.alcohol.fixed.acidity.plot,
  quality.density.fixed.acidity.plot,
  ncol = 1, main = title
)
Most of the wines of high quality and low alcohol, as well as those of high quality and high density, are of high fixed acidity. This is consistent with the correlation between alcohol and density.
# Final plot 3: the two faceted quality vs alcohol plots stacked in a
# single column.
grid.arrange(
  quality.alcohol.by.fixed.acidity.plot,
  quality.alcohol.by.volatile.acidity.plot,
  ncol = 1
)
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
## geom_smooth: method="auto" and size of largest group is >=1000, so using gam with formula: y ~ s(x, bs = "cs"). Use 'method = x' to change the smoothing method.
The increase of wine quality with alcohol seems more evident for lower values of fixed acidity and higher values of volatile acidity.
My exploration of the white wines data set revealed some interesting findings. The data is made of 4898 entries, each one having 11 chemical properties of a white wine and the quality, as the output attribute, graded by wine experts. All the attributes are numerical.
After summarising the data, I went through individual attributes to understand their distributions and I found that most of them seemed to be normally distributed. Going through pair relationships, I found two pairs of highly correlated attributes, namely “Density-Alcohol” and “Residual sugar-Density”. It may not be interesting to use them together when building a model.
Plotting them against wine quality (with a smooth curve to help me see trends) I found that the output variable is mainly influenced (positively or negatively) by alcohol, pH, fixed acidity, volatile acidity, and density. I then went through some combinations of these variables to understand how they relate to wine quality. Since they were all numerical, I first started with a heat map, but it was difficult for me to get any insight from it. I then transformed some of the variables by dividing them into two buckets around their medians, and this helped me to gain more findings.
Unfortunately, I have not found any linear (or near linear) relationship between wine quality and any attribute, combination or transformation of attributes, so I preferred not to build a linear model for now. I think that several model types should be considered and compared to get the best prediction of wine quality. Let me take the “Intro to Machine Learning” course before reworking on this!