Association Rules

# Read the raw table; no header is supplied, so columns get the default
# names V1, V2, ...
data <- read.table(file = "../biom_378_sm_010408.txt")

# Convert every column except the first into a factor so that each
# variable/level combination can be handled as a categorical item.
data[-1] <- lapply(data[-1], factor)

# Preview the first rows to check the import.
head(data)
##   V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12 V13 V14 V15 V16 V17 V18 V19 V20 V21
## 1  1  0  0  0  0  0  0  0  0   1   1   0   0   1   1   0   0   0   0   1   0
## 2  1  0  0  0  0  0  0  0  0   1   1   0   0   1   1   0   0   0   0   1   0
## 3  1  0  0  0  0  0  0  0  0   1   1   0   0   1   1   0   0   0   0   1   0
## 4  1  0  0  0  0  0  0  0  0   0   0   0   0   1   1   0   0   0   0   1   0
## 5  1  0  0  0  0  0  0  0  0   1   0   0   0   1   1   0   0   0   0   0   0
## 6  1  0  0  0  0  0  0  0  0   1   1   0   0   1   1   0   0   0   0   0   0
##   V22 V23 V24 V25 V26 V27 V28 V29 V30 V31 V32 V33 V34 V35 V36 V37 V38 V39 V40
## 1   0   0   0   0   0   1   0   0   0   0   0   1   0   0   0   1   0   0   0
## 2   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   1   0   0   0
## 3   0   0   0   0   0   1   0   0   0   0   0   1   0   0   0   1   0   0   0
## 4   0   0   0   1   0   1   0   0   0   0   0   1   0   0   0   1   0   0   1
## 5   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   1   1   0   0
## 6   0   0   0   0   0   1   0   0   1   0   0   1   0   0   0   1   0   0   0
##   V41 V42 V43 V44 V45 V46 V47 V48 V49 V50 V51 V52 V53 V54 V55 V56 V57 V58 V59
## 1   1   0   0   0   0   0   0   1   0   0   1   0   0   0   0   0   0   0   0
## 2   0   0   0   0   1   0   0   0   0   0   1   0   0   0   1   0   0   0   0
## 3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
## 4   0   0   0   0   0   0   0   1   0   0   1   0   0   0   1   0   0   1   0
## 5   0   0   0   0   0   0   1   0   1   0   1   0   0   0   1   0   0   0   0
## 6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0
##   V60 V61 V62 V63 V64 V65 V66 V67 V68 V69 V70 V71 V72 V73 V74 V75 V76 V77 V78
## 1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 2   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 4   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 5   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 6   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
##   V79 V80 V81 V82 V83 V84 V85
## 1   0   0   0   0   0   0   0
## 2   0   0   0   0   0   0   0
## 3   0   0   0   0   0   0   0
## 4   0   0   1   0   0   0   0
## 5   0   0   1   0   0   0   0
## 6   1   0   1   0   0   0   0
library(arules)
# Mine association rules from every column except the first.
#   support / confidence : minimum thresholds (marked as the default
#                          settings by the original author)
#   target               : return rules
#   minlen / maxlen      : keep rules with 2 or 3 items in total (lhs + rhs)
res <- apriori(
  data = data[-1],
  parameter = list(
    support = 0.1,
    confidence = 0.8,
    target = "rules",
    minlen = 2,
    maxlen = 3
  )
)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.8    0.1    1 none FALSE            TRUE       5     0.1      2
##  maxlen target  ext
##       3  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 8 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[168 item(s), 89 transaction(s)] done [0.00s].
## sorting and recoding items ... [104 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.03s].
## writing ... [372749 rule(s)] done [0.07s].
## creating S4 object  ... done [0.05s].

Summary

# Overview of the mined rule set: rule-length distribution and quality
# measures. The number of rules is too large to inspect pair by pair.
summary(object = res)
## set of 372749 rules
## 
## rule length distribution (lhs + rhs):sizes
##      2      3 
##   7670 365079 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   2.979   3.000   3.000 
## 
## summary of quality measures:
##     support         confidence        coverage           lift       
##  Min.   :0.1011   Min.   :0.8000   Min.   :0.1011   Min.   :0.8279  
##  1st Qu.:0.3483   1st Qu.:0.9268   1st Qu.:0.3596   1st Qu.:0.9947  
##  Median :0.7640   Median :0.9639   Median :0.8202   Median :0.9990  
##  Mean   :0.6255   Mean   :0.9508   Mean   :0.6597   Mean   :1.0090  
##  3rd Qu.:0.8652   3rd Qu.:0.9877   3rd Qu.:0.9213   3rd Qu.:1.0130  
##  Max.   :0.9888   Max.   :1.0000   Max.   :0.9888   Max.   :3.4231  
##      count      
##  Min.   : 9.00  
##  1st Qu.:31.00  
##  Median :68.00  
##  Mean   :55.67  
##  3rd Qu.:77.00  
##  Max.   :88.00  
## 
## mining info:
##      data ntransactions support confidence
##  data[-1]            89     0.1        0.8
# Show the 20 rules with the highest confidence.
# confidence = 1 means that every transaction containing the LHS also
# contains the RHS.
rules_by_confidence <- sort(res, by = "confidence")
inspect(rules_by_confidence[1:20])
##      lhs        rhs     support   confidence coverage  lift     count
## [1]  {V31=1} => {V14=0} 0.1011236 1          0.1011236 1.412698 9    
## [2]  {V31=1} => {V37=0} 0.1011236 1          0.1011236 1.202703 9    
## [3]  {V31=1} => {V20=0} 0.1011236 1          0.1011236 1.171053 9    
## [4]  {V31=1} => {V49=0} 0.1011236 1          0.1011236 1.155844 9    
## [5]  {V31=1} => {V40=0} 0.1011236 1          0.1011236 1.126582 9    
## [6]  {V31=1} => {V4=0}  0.1011236 1          0.1011236 1.085366 9    
## [7]  {V31=1} => {V54=0} 0.1011236 1          0.1011236 1.085366 9    
## [8]  {V31=1} => {V51=0} 0.1011236 1          0.1011236 1.072289 9    
## [9]  {V31=1} => {V6=0}  0.1011236 1          0.1011236 1.072289 9    
## [10] {V31=1} => {V70=0} 0.1011236 1          0.1011236 1.059524 9    
## [11] {V31=1} => {V8=0}  0.1011236 1          0.1011236 1.047059 9    
## [12] {V31=1} => {V71=0} 0.1011236 1          0.1011236 1.047059 9    
## [13] {V31=1} => {V45=0} 0.1011236 1          0.1011236 1.047059 9    
## [14] {V31=1} => {V29=0} 0.1011236 1          0.1011236 1.047059 9    
## [15] {V31=1} => {V23=0} 0.1011236 1          0.1011236 1.047059 9    
## [16] {V31=1} => {V79=0} 0.1011236 1          0.1011236 1.047059 9    
## [17] {V31=1} => {V38=0} 0.1011236 1          0.1011236 1.034884 9    
## [18] {V31=1} => {V57=0} 0.1011236 1          0.1011236 1.034884 9    
## [19] {V31=1} => {V9=0}  0.1011236 1          0.1011236 1.034884 9    
## [20] {V31=1} => {V64=0} 0.1011236 1          0.1011236 1.034884 9
# Show the 20 rules with the highest support.
# A higher support goes together with a higher count.
rules_by_support <- sort(res, by = "support")
inspect(rules_by_support[1:20])
##      lhs        rhs     support   confidence coverage  lift      count
## [1]  {V26=0} => {V24=0} 0.9887640 1.0000000  0.9887640 1.0113636 88   
## [2]  {V24=0} => {V26=0} 0.9887640 1.0000000  0.9887640 1.0113636 88   
## [3]  {V21=0} => {V69=0} 0.9887640 1.0000000  0.9887640 1.0113636 88   
## [4]  {V69=0} => {V21=0} 0.9887640 1.0000000  0.9887640 1.0113636 88   
## [5]  {V84=0} => {V60=0} 0.9887640 1.0000000  0.9887640 1.0113636 88   
## [6]  {V60=0} => {V84=0} 0.9887640 1.0000000  0.9887640 1.0113636 88   
## [7]  {V30=0} => {V21=0} 0.9775281 1.0000000  0.9775281 1.0113636 87   
## [8]  {V21=0} => {V30=0} 0.9775281 0.9886364  0.9887640 1.0113636 87   
## [9]  {V30=0} => {V69=0} 0.9775281 1.0000000  0.9775281 1.0113636 87   
## [10] {V69=0} => {V30=0} 0.9775281 0.9886364  0.9887640 1.0113636 87   
## [11] {V62=0} => {V39=0} 0.9775281 1.0000000  0.9775281 1.0113636 87   
## [12] {V39=0} => {V62=0} 0.9775281 0.9886364  0.9887640 1.0113636 87   
## [13] {V12=0} => {V84=0} 0.9775281 1.0000000  0.9775281 1.0113636 87   
## [14] {V84=0} => {V12=0} 0.9775281 0.9886364  0.9887640 1.0113636 87   
## [15] {V12=0} => {V60=0} 0.9775281 1.0000000  0.9775281 1.0113636 87   
## [16] {V60=0} => {V12=0} 0.9775281 0.9886364  0.9887640 1.0113636 87   
## [17] {V42=0} => {V7=0}  0.9775281 1.0000000  0.9775281 1.0113636 87   
## [18] {V7=0}  => {V42=0} 0.9775281 0.9886364  0.9887640 1.0113636 87   
## [19] {V80=0} => {V26=0} 0.9775281 0.9886364  0.9887640 0.9998709 87   
## [20] {V26=0} => {V80=0} 0.9775281 0.9886364  0.9887640 0.9998709 87

Visualization

  • 점의 크기 : Support
  • 점의 진하기 : Lift
library(arulesViz)

# Graph view of the 30 rules with the highest confidence.
# NOTE(review): the output echoed right after this call is a dump of the
# available control parameters (engine = ggplot2), and `type` is not among
# them -- the `type` control and the vertex/edge options may not be
# recognised by the engine in use; confirm against the installed arulesViz.
plot(
  sort(res, by = "confidence")[1:30],
  method = "graph",
  control = list(type = "items"),
  vertex.label.cex = 0.7,
  edge.arrow.size = 0.3,
  edge.arrow.width = 2
)
## Available control parameters (with default values):
## layout    =  list(fun = function (graph, dim = 2, ...)  {     if ("layout" %in% graph_attr_names(graph)) {         lay <- graph_attr(graph, "layout")         if (is.function(lay)) {             lay(graph, ...)         }         else {             lay         }     }     else if (all(c("x", "y") %in% vertex_attr_names(graph))) {         if ("z" %in% vertex_attr_names(graph)) {             cbind(V(graph)$x, V(graph)$y, V(graph)$z)         }         else {             cbind(V(graph)$x, V(graph)$y)         }     }     else if (vcount(graph) < 1000) {         layout_with_fr(graph, dim = dim, ...)     }     else {         layout_with_drl(graph, dim = dim, ...)     } }, call_str = c("layout_nicely(<graph>, input = \"/Users/joshua/Library/CloudStorage/OneDrive-개인/22. 서울대학교/2. 연구/석사논문/project/20211123/arules 복사본.Rmd\", ", "    encoding = \"UTF-8\")"), args = list())
## edges     =  <environment>
## nodes     =  <environment>
## nodetext  =  <environment>
## colors    =  c("#EE0000FF", "#EEEEEEFF")
## engine    =  ggplot2
## max   =  100
## verbose   =  FALSE

# Graph view of the 30 rules with the highest support.
# NOTE(review): the output echoed right after this call is a dump of the
# available control parameters (engine = ggplot2), and `type` is not among
# them -- the `type` control and the vertex/edge options may not be
# recognised by the engine in use; confirm against the installed arulesViz.
plot(
  sort(res, by = "support")[1:30],
  method = "graph",
  control = list(type = "items"),
  vertex.label.cex = 0.7,
  edge.arrow.size = 0.3,
  edge.arrow.width = 2
)
## Available control parameters (with default values):
## layout    =  list(fun = function (graph, dim = 2, ...)  {     if ("layout" %in% graph_attr_names(graph)) {         lay <- graph_attr(graph, "layout")         if (is.function(lay)) {             lay(graph, ...)         }         else {             lay         }     }     else if (all(c("x", "y") %in% vertex_attr_names(graph))) {         if ("z" %in% vertex_attr_names(graph)) {             cbind(V(graph)$x, V(graph)$y, V(graph)$z)         }         else {             cbind(V(graph)$x, V(graph)$y)         }     }     else if (vcount(graph) < 1000) {         layout_with_fr(graph, dim = dim, ...)     }     else {         layout_with_drl(graph, dim = dim, ...)     } }, call_str = c("layout_nicely(<graph>, input = \"/Users/joshua/Library/CloudStorage/OneDrive-개인/22. 서울대학교/2. 연구/석사논문/project/20211123/arules 복사본.Rmd\", ", "    encoding = \"UTF-8\")"), args = list())
## edges     =  <environment>
## nodes     =  <environment>
## nodetext  =  <environment>
## colors    =  c("#EE0000FF", "#EEEEEEFF")
## engine    =  ggplot2
## max   =  100
## verbose   =  FALSE