Association Rules
# Load the raw table. read.table() is called without a header, so the columns
# receive the default names V1, V2, ...
data <- read.table("../biom_378_sm_010408.txt")

# Recode every column except the first as a factor so that each level can be
# treated as a categorical item.
# List-style indexing (`data[-1]`) always yields a data frame, whereas
# `data[, -1]` collapses to a bare vector when only one column remains, which
# would make lapply() iterate over cells instead of columns.
# NOTE(review): V1 is left untouched here and excluded from mining later;
# presumably it is an identifier/constant column -- confirm.
data[-1] <- lapply(data[-1], factor)

# Preview the first rows as a sanity check.
head(data)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0
## 2 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0
## 3 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 1 0
## 4 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 1 0
## 5 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 0 0 0
## 6 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0
## V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40
## 1 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0
## 2 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0
## 3 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0
## 4 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 1 0 0 1
## 5 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 0
## 6 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0
## V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59
## 1 1 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
## 2 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## 4 0 0 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0
## 5 0 0 0 0 0 0 1 0 1 0 1 0 0 0 1 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
## V60 V61 V62 V63 V64 V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 5 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## V79 V80 V81 V82 V83 V84 V85
## 1 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0
## 3 0 0 0 0 0 0 0
## 4 0 0 1 0 0 0 0
## 5 0 0 1 0 0 0 0
## 6 1 0 1 0 0 0 0
library(arules)
# Mine association rules from every column except the first one.
# Each factor level becomes its own item, so both "Vk=0" and "Vk=1" items take
# part in the rules.
# NOTE(review): absence items ("Vk=0") are very frequent and may dominate the
# result; consider whether rules about absences are wanted at all.
res <- apriori(
  data = data[-1],
  parameter = list(
    support = 0.1,     # minimum support (default setting)
    confidence = 0.8,  # minimum confidence (default setting)
    target = "rules",
    minlen = 2,        # shortest rule: one LHS item plus one RHS item
    maxlen = 3         # longest rule: three items over LHS and RHS
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.1 2
## maxlen target ext
## 3 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 8
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[168 item(s), 89 transaction(s)] done [0.00s].
## sorting and recoding items ... [104 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.03s].
## writing ... [372749 rule(s)] done [0.07s].
## creating S4 object ... done [0.05s].
Summary
# Overview of the mined rule set: rule-length distribution and the range of
# each quality measure. Far too many rules are returned to read one by one.
summary(res)
## set of 372749 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 7670 365079
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 2.979 3.000 3.000
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.1011 Min. :0.8000 Min. :0.1011 Min. :0.8279
## 1st Qu.:0.3483 1st Qu.:0.9268 1st Qu.:0.3596 1st Qu.:0.9947
## Median :0.7640 Median :0.9639 Median :0.8202 Median :0.9990
## Mean :0.6255 Mean :0.9508 Mean :0.6597 Mean :1.0090
## 3rd Qu.:0.8652 3rd Qu.:0.9877 3rd Qu.:0.9213 3rd Qu.:1.0130
## Max. :0.9888 Max. :1.0000 Max. :0.9888 Max. :3.4231
## count
## Min. : 9.00
## 1st Qu.:31.00
## Median :68.00
## Mean :55.67
## 3rd Qu.:77.00
## Max. :88.00
##
## mining info:
## data ntransactions support confidence
## data[-1] 89 0.1 0.8
# Show the 20 rules with the highest confidence.
# confidence = 1 means that every transaction containing the LHS also contains
# the RHS.
# head(..., by =) ranks the rules and keeps at most `n` of them, so it also
# works when fewer than 20 rules were mined (`[1:20]` would index past the end)
# and avoids building a fully sorted copy of the whole rule set.
inspect(head(res, n = 20, by = "confidence"))
## lhs rhs support confidence coverage lift count
## [1] {V31=1} => {V14=0} 0.1011236 1 0.1011236 1.412698 9
## [2] {V31=1} => {V37=0} 0.1011236 1 0.1011236 1.202703 9
## [3] {V31=1} => {V20=0} 0.1011236 1 0.1011236 1.171053 9
## [4] {V31=1} => {V49=0} 0.1011236 1 0.1011236 1.155844 9
## [5] {V31=1} => {V40=0} 0.1011236 1 0.1011236 1.126582 9
## [6] {V31=1} => {V4=0} 0.1011236 1 0.1011236 1.085366 9
## [7] {V31=1} => {V54=0} 0.1011236 1 0.1011236 1.085366 9
## [8] {V31=1} => {V51=0} 0.1011236 1 0.1011236 1.072289 9
## [9] {V31=1} => {V6=0} 0.1011236 1 0.1011236 1.072289 9
## [10] {V31=1} => {V70=0} 0.1011236 1 0.1011236 1.059524 9
## [11] {V31=1} => {V8=0} 0.1011236 1 0.1011236 1.047059 9
## [12] {V31=1} => {V71=0} 0.1011236 1 0.1011236 1.047059 9
## [13] {V31=1} => {V45=0} 0.1011236 1 0.1011236 1.047059 9
## [14] {V31=1} => {V29=0} 0.1011236 1 0.1011236 1.047059 9
## [15] {V31=1} => {V23=0} 0.1011236 1 0.1011236 1.047059 9
## [16] {V31=1} => {V79=0} 0.1011236 1 0.1011236 1.047059 9
## [17] {V31=1} => {V38=0} 0.1011236 1 0.1011236 1.034884 9
## [18] {V31=1} => {V57=0} 0.1011236 1 0.1011236 1.034884 9
## [19] {V31=1} => {V9=0} 0.1011236 1 0.1011236 1.034884 9
## [20] {V31=1} => {V64=0} 0.1011236 1 0.1011236 1.034884 9
# Show the 20 rules with the highest support.
# A higher support always comes with a higher count, because support is the
# count divided by the number of transactions.
# head(..., by =) ranks the rules and keeps at most `n` of them, so it also
# works when fewer than 20 rules were mined (`[1:20]` would index past the end)
# and avoids building a fully sorted copy of the whole rule set.
inspect(head(res, n = 20, by = "support"))
## lhs rhs support confidence coverage lift count
## [1] {V26=0} => {V24=0} 0.9887640 1.0000000 0.9887640 1.0113636 88
## [2] {V24=0} => {V26=0} 0.9887640 1.0000000 0.9887640 1.0113636 88
## [3] {V21=0} => {V69=0} 0.9887640 1.0000000 0.9887640 1.0113636 88
## [4] {V69=0} => {V21=0} 0.9887640 1.0000000 0.9887640 1.0113636 88
## [5] {V84=0} => {V60=0} 0.9887640 1.0000000 0.9887640 1.0113636 88
## [6] {V60=0} => {V84=0} 0.9887640 1.0000000 0.9887640 1.0113636 88
## [7] {V30=0} => {V21=0} 0.9775281 1.0000000 0.9775281 1.0113636 87
## [8] {V21=0} => {V30=0} 0.9775281 0.9886364 0.9887640 1.0113636 87
## [9] {V30=0} => {V69=0} 0.9775281 1.0000000 0.9775281 1.0113636 87
## [10] {V69=0} => {V30=0} 0.9775281 0.9886364 0.9887640 1.0113636 87
## [11] {V62=0} => {V39=0} 0.9775281 1.0000000 0.9775281 1.0113636 87
## [12] {V39=0} => {V62=0} 0.9775281 0.9886364 0.9887640 1.0113636 87
## [13] {V12=0} => {V84=0} 0.9775281 1.0000000 0.9775281 1.0113636 87
## [14] {V84=0} => {V12=0} 0.9775281 0.9886364 0.9887640 1.0113636 87
## [15] {V12=0} => {V60=0} 0.9775281 1.0000000 0.9775281 1.0113636 87
## [16] {V60=0} => {V12=0} 0.9775281 0.9886364 0.9887640 1.0113636 87
## [17] {V42=0} => {V7=0} 0.9775281 1.0000000 0.9775281 1.0113636 87
## [18] {V7=0} => {V42=0} 0.9775281 0.9886364 0.9887640 1.0113636 87
## [19] {V80=0} => {V26=0} 0.9775281 0.9886364 0.9887640 0.9998709 87
## [20] {V26=0} => {V80=0} 0.9775281 0.9886364 0.9887640 0.9998709 87
Visualization
- 점의 크기 : Support
- 점의 진하기 : Lift
library(arulesViz)
# Draw the 30 highest-confidence rules as a graph.
# head(..., by =) keeps at most 30 rules, so it is safe even when fewer rules
# exist (`[1:30]` would index past the end).
# The previous extra arguments (control = list(type = "items"),
# vertex.label.cex, edge.arrow.size, edge.arrow.width) are igraph-style options
# that the default ggplot2 engine does not know; it ignored them and printed
# its list of available control parameters instead. They are removed so the
# same plot is produced without that noise.
# NOTE(review): if the igraph look is wanted, pass engine = "igraph" and check
# the control parameter names against the installed arulesViz version.
plot(head(res, n = 30, by = "confidence"), method = "graph")
## Available control parameters (with default values):
## layout = list(fun = function (graph, dim = 2, ...) { if ("layout" %in% graph_attr_names(graph)) { lay <- graph_attr(graph, "layout") if (is.function(lay)) { lay(graph, ...) } else { lay } } else if (all(c("x", "y") %in% vertex_attr_names(graph))) { if ("z" %in% vertex_attr_names(graph)) { cbind(V(graph)$x, V(graph)$y, V(graph)$z) } else { cbind(V(graph)$x, V(graph)$y) } } else if (vcount(graph) < 1000) { layout_with_fr(graph, dim = dim, ...) } else { layout_with_drl(graph, dim = dim, ...) } }, call_str = c("layout_nicely(<graph>, input = \"/Users/joshua/Library/CloudStorage/OneDrive-개인/22. 서울대학교/2. 연구/석사논문/project/20211123/arules 복사본.Rmd\", ", " encoding = \"UTF-8\")"), args = list())
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE

# Draw the 30 highest-support rules as a graph.
# head(..., by =) keeps at most 30 rules, so it is safe even when fewer rules
# exist (`[1:30]` would index past the end).
# The previous extra arguments (control = list(type = "items"),
# vertex.label.cex, edge.arrow.size, edge.arrow.width) are igraph-style options
# that the default ggplot2 engine does not know; it ignored them and printed
# its list of available control parameters instead. They are removed so the
# same plot is produced without that noise.
# NOTE(review): if the igraph look is wanted, pass engine = "igraph" and check
# the control parameter names against the installed arulesViz version.
plot(head(res, n = 30, by = "support"), method = "graph")
## Available control parameters (with default values):
## layout = list(fun = function (graph, dim = 2, ...) { if ("layout" %in% graph_attr_names(graph)) { lay <- graph_attr(graph, "layout") if (is.function(lay)) { lay(graph, ...) } else { lay } } else if (all(c("x", "y") %in% vertex_attr_names(graph))) { if ("z" %in% vertex_attr_names(graph)) { cbind(V(graph)$x, V(graph)$y, V(graph)$z) } else { cbind(V(graph)$x, V(graph)$y) } } else if (vcount(graph) < 1000) { layout_with_fr(graph, dim = dim, ...) } else { layout_with_drl(graph, dim = dim, ...) } }, call_str = c("layout_nicely(<graph>, input = \"/Users/joshua/Library/CloudStorage/OneDrive-개인/22. 서울대학교/2. 연구/석사논문/project/20211123/arules 복사본.Rmd\", ", " encoding = \"UTF-8\")"), args = list())
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
