-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_integration_and_mining_with_graphs-kernels.html
214 lines (214 loc) · 179 KB
/
data_integration_and_mining_with_graphs-kernels.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd">
<html xmlns="http://www.w3.org/1999/xhtml"><!--This file was converted to xhtml by LibreOffice - see http://cgit.freedesktop.org/libreoffice/core/tree/filter/source/xslt for the code.--><head profile="http://dublincore.org/documents/dcmi-terms/"><meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8"/><title xml:lang="en-US">- no title specified</title><meta name="DCTERMS.title" content="" xml:lang="en-US"/><meta name="DCTERMS.language" content="en-US" scheme="DCTERMS.RFC4646"/><meta name="DCTERMS.source" content="http://xml.openoffice.org/odf2xhtml"/><meta name="DCTERMS.creator" content="JeanKarim Heriche"/><meta name="DCTERMS.issued" content="2015-12-01T09:33:37.349441049" scheme="DCTERMS.W3CDTF"/><meta name="DCTERMS.modified" content="2015-12-17T21:05:38.828023000" scheme="DCTERMS.W3CDTF"/><meta name="DCTERMS.provenance" content="" xml:lang="en-US"/><meta name="DCTERMS.subject" content="," xml:lang="en-US"/><link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" hreflang="en"/><link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" hreflang="en"/><link rel="schema.DCTYPE" href="http://purl.org/dc/dcmitype/" hreflang="en"/><link rel="schema.DCAM" href="http://purl.org/dc/dcam/" hreflang="en"/><style type="text/css">
@page { }
table { border-collapse:collapse; border-spacing:0; empty-cells:show }
td, th { vertical-align:top; font-size:12pt;}
h1, h2, h3, h4, h5, h6 { clear:both }
ol, ul { margin:0; padding:0;}
li { list-style: none; margin:0; padding:0;}
<!-- "li span.odfLiEnd" - IE 7 issue-->
li span. { clear: both; line-height:0; width:0; height:0; margin:0; padding:0; }
span.footnodeNumber { padding-right:1em; }
span.annotation_style_by_filter { font-size:95%; font-family:Arial; background-color:#fff000; margin:0; border:0; padding:0; }
* { margin:0;}
.fr1 { border-style:none; font-size:12pt; margin-bottom:0in; margin-left:0in; margin-right:0in; margin-top:0in; padding:0in; font-family:Liberation Serif; text-align:center; vertical-align:top; writing-mode:lr-tb; }
.fr2 { font-size:12pt; font-family:Liberation Serif; text-align:center; vertical-align:top; writing-mode:lr-tb; margin-left:0in; margin-right:0in; margin-top:0in; margin-bottom:0in; padding:0in; border-style:none; }
.Figure { font-size:12pt; font-style:italic; margin-bottom:0.0835in; margin-top:0.0835in; font-family:Liberation Serif; writing-mode:page; }
.P1 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:normal; }
.P10 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:center ! important; }
.P11 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:center ! important; }
.P12 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:left ! important; font-weight:bold; }
.P13 { font-size:28pt; font-family:Arial; writing-mode:page; text-align:center ! important; }
.P14 { font-size:12pt; font-family:Arial; writing-mode:page; text-align:left ! important; font-weight:normal; }
.P15 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:normal; }
.P16 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:normal; }
.P17 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:normal; }
.P18 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:bold; }
.P19 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:bold; }
.P2 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:normal; }
.P20 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:bold; }
.P21 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; }
.P22 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; }
.P23 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; }
.P24 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; font-weight:normal; }
.P25 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; font-weight:normal; }
.P26 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; }
.P27 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; }
.P28 { font-size:26pt; font-family:Arial; writing-mode:page; text-align:center ! important; }
.P29 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; vertical-align:super; font-size:58%;background-color:transparent; }
.P3 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:normal; }
.P30 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P31 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P32 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P33 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P34 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P35 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P36 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P37 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P38 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P39 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P4 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:normal; }
.P40 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P41 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P42 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P43 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P44 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P45 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P46 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P47 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P48 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P49 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P5 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:normal; }
.P50 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P51 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P52 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P53 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P54 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P55 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P56 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P57 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P58 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P59 { font-size:14pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:bold; }
.P6 { font-size:14pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:bold; }
.P60 { font-size:14pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:bold; }
.P61 { font-size:14pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:bold; }
.P62 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P63 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P64 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P65 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P66 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P67 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P68 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P69 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P7 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:left ! important; }
.P70 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P71 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P72 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P73 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P74 { font-size:12pt; font-family:Ubuntu Mono; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; background-color:#e6e6e6; color:#000000; font-weight:normal; }
.P75 { font-size:14pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P76 { font-size:14pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P77 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P78 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P79 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P8 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:center ! important; }
.P80 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P81 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P82 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P83 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P84 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P85 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0.4925in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; font-weight:bold; }
.P86 { font-size:12pt; font-family:Arial; writing-mode:page; margin-left:0.4925in; margin-right:0in; line-height:150%; text-align:left ! important; text-indent:0in; color:#000000; font-weight:normal; }
.P87 { font-size:10pt; margin-bottom:0.1965in; margin-top:0in; font-family:DejaVu Sans Mono; writing-mode:page; line-height:150%; }
.P88 { font-size:12pt; font-family:Arial; writing-mode:page; text-align:left ! important; font-weight:normal; }
.P89 { font-size:12pt; font-family:Arial; writing-mode:page; text-align:left ! important; font-weight:normal; }
.P9 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:center ! important; }
.P90 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:bold; }
.P91 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; font-weight:bold; }
.P92 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; }
.P93 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; }
.P94 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; }
.P95 { font-size:12pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:normal; }
.P96 { font-size:14pt; font-family:Arial; writing-mode:page; text-align:left ! important; font-weight:bold; }
.P97 { font-size:14pt; font-family:Arial; writing-mode:page; line-height:150%; text-align:left ! important; color:#000000; font-weight:bold; }
.Bullet_20_Symbols { font-family:OpenSymbol; }
.Internet_20_link { color:#000080; text-decoration:underline; }
.T1 { font-weight:normal; }
.T10 { color:#000000; font-weight:normal; }
.T105 { font-size:12pt; }
.T11 { color:#000000; font-weight:normal; }
.T114 { font-family:Ubuntu Mono; }
.T12 { color:#000000; font-weight:normal; }
.T120 { font-family:Arial; font-size:12pt; }
.T121 { font-family:Arial; font-size:12pt; }
.T122 { font-family:Arial; font-size:12pt; font-weight:normal; }
.T13 { color:#000000; font-weight:normal; }
.T14 { color:#000000; font-weight:normal; }
.T15 { color:#000000; font-weight:normal; }
.T16 { color:#000000; font-weight:normal; }
.T17 { color:#000000; font-weight:normal; }
.T18 { color:#000000; font-weight:normal; }
.T19 { color:#000000; font-weight:normal; }
.T2 { font-weight:normal; }
.T20 { color:#000000; font-weight:normal; }
.T21 { color:#000000; font-weight:normal; }
.T22 { color:#000000; font-weight:normal; }
.T23 { color:#000000; font-weight:normal; }
.T24 { color:#000000; font-weight:normal; }
.T25 { color:#000000; font-weight:normal; }
.T26 { color:#000000; font-weight:normal; }
.T27 { color:#000000; font-weight:normal; }
.T28 { color:#000000; font-weight:normal; }
.T29 { color:#000000; font-weight:normal; }
.T3 { color:#000000; font-weight:normal; }
.T30 { color:#000000; font-weight:normal; }
.T31 { color:#000000; font-weight:normal; }
.T32 { color:#000000; font-weight:normal; }
.T33 { color:#000000; font-weight:normal; }
.T34 { color:#000000; font-weight:normal; }
.T35 { color:#000000; font-weight:normal; }
.T36 { color:#000000; font-weight:normal; }
.T37 { color:#000000; font-weight:normal; }
.T38 { color:#000000; font-weight:normal; }
.T39 { color:#000000; font-weight:normal; }
.T4 { color:#000000; font-weight:normal; }
.T40 { color:#000000; font-weight:normal; }
.T41 { color:#000000; font-weight:normal; }
.T42 { color:#000000; vertical-align:sub; font-size:58%;font-weight:normal; }
.T43 { color:#000000; vertical-align:sub; font-size:58%;font-weight:normal; }
.T44 { color:#000000; vertical-align:super; font-size:58%;font-weight:normal; }
.T45 { color:#000000; vertical-align:super; font-size:58%;font-weight:normal; }
.T46 { color:#000000; font-size:12pt; font-weight:normal; }
.T47 { color:#000000; font-size:12pt; font-weight:normal; }
.T48 { color:#000000; font-weight:normal; }
.T49 { color:#000000; }
.T5 { color:#000000; font-weight:normal; }
.T50 { color:#000000; }
.T51 { color:#000000; }
.T52 { color:#000000; }
.T53 { color:#000000; }
.T54 { color:#000000; font-size:12pt; font-weight:normal; }
.T55 { color:#000000; font-size:12pt; font-weight:normal; }
.T56 { color:#000000; font-size:12pt; font-weight:normal; }
.T57 { color:#000000; font-size:12pt; font-weight:normal; }
.T58 { color:#000000; font-size:12pt; font-weight:normal; }
.T59 { color:#000000; font-size:12pt; font-weight:normal; }
.T6 { color:#000000; font-weight:normal; }
.T60 { color:#000000; font-size:12pt; font-weight:normal; }
.T61 { color:#000000; font-size:12pt; font-weight:normal; }
.T62 { color:#000000; font-size:12pt; font-weight:normal; }
.T63 { color:#000000; font-size:12pt; font-weight:normal; }
.T64 { color:#000000; }
.T65 { color:#000000; font-size:12pt; letter-spacing:normal; font-style:normal; font-weight:normal; }
.T66 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; text-decoration:none ! important; font-weight:normal; }
.T67 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; text-decoration:none ! important; font-weight:normal; }
.T68 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; text-decoration:none ! important; font-weight:normal; }
.T69 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; text-decoration:none ! important; font-weight:normal; }
.T7 { color:#000000; font-weight:normal; }
.T70 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; text-decoration:none ! important; font-weight:normal; }
.T71 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; text-decoration:none ! important; font-weight:normal; }
.T72 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; font-weight:normal; }
.T73 { color:#000000; font-family:Arial; font-size:12pt; letter-spacing:normal; font-style:normal; font-weight:normal; }
.T75 { vertical-align:sub; font-size:58%;}
.T76 { vertical-align:sub; font-size:58%;font-weight:normal; }
.T78 { vertical-align:super; font-size:58%;}
.T79 { vertical-align:super; font-size:58%;}
.T8 { color:#000000; font-weight:normal; }
.T80 { vertical-align:super; font-size:58%;background-color:transparent; }
.T81 { vertical-align:super; font-size:58%;font-weight:normal; }
.T82 { vertical-align:super; font-size:58%;font-weight:normal; background-color:transparent; }
.T83 { vertical-align:super; font-size:58%;font-size:12pt; font-weight:normal; background-color:transparent; }
.T84 { font-size:12pt; font-weight:normal; }
.T85 { font-size:12pt; font-weight:normal; }
.T86 { font-size:12pt; font-weight:normal; background-color:transparent; }
.T87 { font-size:12pt; font-weight:normal; background-color:transparent; }
.T88 { font-size:12pt; font-weight:normal; background-color:transparent; }
.T9 { color:#000000; font-weight:normal; }
<!-- ODF styles with no properties representable as CSS -->
.T100 .T101 .T102 .T103 .T104 .T106 .T107 .T108 .T109 .T110 .T111 .T112 .T113 .T115 .T116 .T117 .T118 .T119 .T74 .T77 .T89 .T90 .T91 .T92 .T93 .T94 .T95 .T96 .T97 .T98 .T99 { }
</style></head><body dir="ltr" style="max-width:8.2681in;margin-top:0.7874in; margin-bottom:0.7874in; margin-left:0.7874in; margin-right:0.7874in; writing-mode:lr-tb; "><p class="P7">EMBL Centre for Network Analysis <span class="T74">workshop 9-11 December 2015</span></p><p class="P13"> </p><p class="P13"> </p><p class="P28">Data integration and mining with graphs:</p><p class="P28"> </p><p class="P28">Kernels on graph nodes</p><p class="P8"> </p><p class="P8"> </p><p class="P9">Jean-Karim Hériché</p><p class="P10">Ellenberg lab</p><p class="P9">Cell Biology and Biophysics unit</p><p class="P8"> </p><p class="P11"> </p><p class="P8"> </p><p class="P8"> </p><p class="P8"> </p><p class="P8"> </p><p class="P8"> </p><p class="P12">Summary</p><p class="P14"> </p><p class="P14"><span> <span class="T94">Biological data sets </span><span class="T95">can be</span><span class="T94"> represented as graphs for the purpose of data integration and mining. We will introduce kernels on graph nodes as </span><span class="T98">measures of similarity between genes</span><span class="T99">,</span><span class="T98"> show how to compute some </span><span class="T99">kernels</span><span class="T94"> for various biological data sources</span><span class="T96"> </span><span class="T94">and how they can be used for data integration. We will apply the kernels to the task of candidate genes selection.</span></span></p><p class="P88"> </p><p class="P88"> </p><p class="P88"> <span> </span></p><p class="P88"><span> </span></p><p class="P88"> </p><p class="P89"><br/></p><p class="P96">Motivation</p><p class="P16"> </p><p class="P17"><span> While large, nearly genome-wide studies are now possible, they remain beyond the reach of most laboratories as they require a large investment in reagents, equipments and data management. In most cases, locally available resources only allow a few dozen to a few hundred genes to be studied, with candidate genes usually selected either from a gene family (e.g. kinases) or after extensive literature and database searches. In the latter case, candidate genes are identified by association to genes already known to be implicated in the process under consideration. Given that links between genes can be indirect and that there are multiple sources of data, this selection process has become a difficult task. In this context, efficient automatic candidate gene selection has the potential to maximize returns on investment in experiments.</span></p><p class="P18"><span class="T1"> Selecting good candidate genes means finding genes that have a high probability of being part of or related to the process one wants to study. If this process is defined in terms of a few genes known to participate in that process, then good candidate genes would be those that are functionally linked to these known genes. </span><span class="T3">Compared to large scale, systematic experiments, </span><span class="T17">finding new candidates based on network context analysis</span><span class="T3"> has the added advantage to provide </span><span class="T17">hypothesis about likely underlying molecular interaction mechanisms involving the predicted targets.</span></p><p class="P19"><span class="T17"> Many methods using the "guilt by association" principle have been developed for predicting gene function and applied to model organisms (for review see [1]). Among these, kernel-based methods have gained in popularity [2,3]. </span><span class="T3">In this context, despite</span><span class="T17"> representing a natural measure of similarity between genes, kernels on graph nodes have received little attention.</span></p><p class="P1"> </p><p class="P6">Kernels</p><p class="P15"><span class="T49"> </span><span class="T50">K</span><span class="T52">ernels and kernel methods </span><span class="T53">are well described and explained in </span><span class="T52">[4] </span><span class="T53">which is suggested reading for anyone interested in the kernel framework.</span></p><p class="P3"> </p><p class="P85"><span class="T49">- </span><span class="T18">What is a kernel ?</span></p><p class="P20"><span class="T17"> A kernel is a function that </span><span class="T3">maps data points (e.g. genes) into some high dimensional space (called feature space) and computes the dot product of the corresponding vectors in that space. That is, for a mapping function Φ and 2 genes x and y, the kernel function K computes K(x,y) = ⟨Φ(x),Φ(y)⟩. A kernel matrix contains the evaluation of a kernel function for all pairs of data points under consideration. </span><span class="T12">Note that w</span><span class="T11">e often use the term kernel to </span><span class="T11">refer to the kernel matrix instead of the function. </span></p><p class="P2"> </p><p class="P86">- Useful properties of kernels:</p><ul><li><p class="P92" style="margin-left:0cm;"><span class="Bullet_20_Symbols" style="display:block;float:left;min-width:0.635cm;">•</span><span class="T55">Because it corresponds to a dot product, a kernel</span><span class="T56"> can be viewed as a similarity </span><span class="T55">measure. After normalization, kernels can be viewed as giving the probability of items being in the same class minus the probability that they are in different classes.</span><span class="odfLiEnd"/> </p></li><li><p class="P95" style="margin-left:0cm;"><span class="Bullet_20_Symbols" style="display:block;float:left;min-width:0.635cm;">•</span>Different kernels corresponds to mappings to different feature spaces so capture different notions of similarity.<span class="odfLiEnd"/> </p></li><li><p class="P93" style="margin-left:0cm;"><span class="Bullet_20_Symbols" style="display:block;float:left;min-width:0.635cm;">•</span><span class="T54">Any symmetric, positive semi-definite matrix (i.e. whose eigenvalues are ≥ 0) correspond</span><span class="T57">s</span><span class="T54"> to </span><span class="T57">pairwise</span><span class="T54"> dot product</span><span class="T57">s</span><span class="T54"> in some feature space </span><span class="T57">and therefore represents a valid kernel. This means we can use any similarity matrix that is symmetric, positive semi-definite as a kernel and that we can apply kernel methods to any type of data as long as we can define some similarity measure between the data points.</span><span class="odfLiEnd"/> </p></li><li><p class="P94" style="margin-left:0cm;"><span class="Bullet_20_Symbols" style="display:block;float:left;min-width:0.635cm;">•</span><span class="T57">K</span><span class="T54">ernels can be </span><span class="T58">transformed or </span><span class="T54">combined using various operations (e.g. linear combination) to produce other kernels. </span><span class="odfLiEnd"/> </p></li></ul><p class="P5"><span class="T103">Proof for these properties can be found in </span><span class="T104">chapter 2 and 3 of [4].</span></p><p class="P4"> </p><p class="P75"><span class="T62"> - </span><span class="T63">Wh</span><span class="T62">y use</span><span class="T63"> kernel</span><span class="T62">s</span><span class="T63"> ?</span></p><p class="P77"><span class="T18"> M</span><span class="T17">any algorithms work using only the dot products between input vectors e.g. PCA, LDA, k-means and other clustering algorithms, support vector machine... The kernel trick often mentioned in the literature refers to using kernel matrices to provide these dot products. This provides modularity as any kernel can work with any dot product-based algorithm. Kernels can be used to </span><span class="T19">map</span><span class="T17"> complex patterns into a feature space where they </span><span class="T19">appear linear allowing them to be efficiently identified </span><span class="T20">using common algorithms</span><span class="T17">. </span><span class="T21">The last property listed above</span><span class="T4"> means that a combination of similarity matrices can still be interpreted as a matrix of similarities if the combined matrices are valid kernels. This property makes kernels particularly attractive for data integration because it provides a principled way of combining information from different sources into one similarity matrix.</span></p><p class="P30"> </p><p class="P59">Measures of graph nodes similarity</p><p class="P78"><span class="T17"> </span><span class="T22">A </span><span class="T23">good measure </span><span class="T22">of similarity between nodes of a graph </span><span class="T23">should satisfy the </span><span class="T24">guilt-by-association </span><span class="T23">intuition that two nodes are more similar to each other when </span><span class="T24">they are connected to similar nodes. Our intuition also tells us that nodes are more similar when </span><span class="T23">there are more connections between them and the paths connecting them are shorter. </span><span class="T22">Therefore, t</span><span class="T23">o measure similarity between nodes, we want to take into account how they are connected.</span><span class="T24"> </span><span class="T25">Some simple measures would </span><span class="T24">involve</span><span class="T25"> counting the number of hops or summing the weights of the hops between two nodes </span><span class="T26">but it's easy to find examples where these measures don't capture meaningful similarity. </span></p><p class="P31"> </p><ul><li><p class="P90" style="margin-left:0cm;"><span class="Bullet_20_Symbols" style="display:block;float:left;min-width:0.635cm;">•</span><span class="T26">R</span><span class="T17">andom walk</span><span class="T27">s</span><span class="T17"> on a graph:</span><span class="odfLiEnd"/> </p></li></ul><p class="P79"><span class="T26"> </span><span class="T21">One way to analyse connectivity in a graph is to select a starting node then move to a random neighbour of this node then move to a random neighbour of that node and so on for an infinite number of steps. The sequence of randomly visited nodes in this process is called a random walk on the graph and can be viewed as a Markov chain. </span><span class="T28">A short simulation can show that a random walk hits closer, more connected nodes more often than more distant, less connected ones </span><span class="T29">or, seen another way,</span><span class="T28"> it ta</span><span class="T30">ke</span><span class="T28">s less time to reach a node from closer, more connected nodes than from more distant, less connected nodes. </span></p><p class="P32"> </p><ul><li><p class="P91" style="margin-left:0cm;"><span class="Bullet_20_Symbols" style="display:block;float:left;min-width:0.635cm;">•</span><span class="T28">S</span><span class="T17">imilarit</span><span class="T31">y</span><span class="T17"> measures from random walks</span><span class="odfLiEnd"/> </p></li></ul><p class="P80"><span class="T28"> The </span><span class="T32">expected </span><span class="T28">time (= number of steps) it takes in a random walk to reach a node starting from a different node is called hitting time. </span><span class="T33">Clearly, </span><span class="T34">although the hitting time is a distance (similar nodes have a small hitting time and dissimilar nodes have a large hitting time), it</span><span class="T33"> captures the notion of similarity we outlined above. However, it is not symmetric as H(</span><span class="T35">i,j</span><span class="T33">) is generally not equal to H(</span><span class="T35">j,i</span><span class="T33">) </span><span class="T36">but it can be used to derive the commute time between </span><span class="T35">i</span><span class="T36"> and </span><span class="T35">j</span><span class="T36"> which measures how long it takes to go from </span><span class="T35">i</span><span class="T36"> to </span><span class="T35">j</span><span class="T36"> and back to </span><span class="T35">i</span><span class="T36">: CT(</span><span class="T35">i,j</span><span class="T36">) = H(</span><span class="T35">i,j</span><span class="T36">)+H(</span><span class="T35">j,i</span><span class="T36">) = CT(</span><span class="T35">j,i</span><span class="T36">) and does capture the same notion of similarity. </span><span class="T32">Other similarity measures can be derived from the random walk. One is called random forest (RF</span><span class="T39">)</span><span class="T32"> </span><span class="T35">because it</span><span class="T6"> arises from the enumeration of the spanning rooted forests in the graph and measures the relative "forest-accessibility" between nodes [</span><span class="T7">5</span><span class="T6">]</span><span class="T32">) and RF(</span><span class="T35">i,j</span><span class="T32">) corresponds to the probability of reaching node </span><span class="T35">j</span><span class="T32"> in a random number of steps starting from node </span><span class="T35">i [</span><span class="T34">6</span><span class="T35">]</span><span class="T32">. </span><span class="T37">Finally, if instead of jumping from node to node we have the </span><span class="T35">random </span><span class="T37">walk move in a continuous fashion along the edges, we simulate a diffusion process. </span><span class="T38">Measuring similarity in a diffusion process involves enumerating all paths between two nodes while penalizing the longer paths </span><span class="T35">[4]</span><span class="T38">.</span></p><p class="P33"> </p><p class="P76"><span class="T51">C</span><span class="T49">omputing kernels on graph nodes</span></p><p class="P81"><span class="T5"> Each data set is viewed as the adjacency matrix A of a weighted undirected graph and a value A</span><span class="T42">ij</span><span class="T5"> of 0 indicates no edge between i and j. In the following, D denotes the </span><span class="T5">diagonal degree matrix, I the identity matrix, L the graph Laplacian (L= D-A). </span><span class="T6">We will compute the following kernels:</span></p><p class="P54"> </p><p class="P54"><span> - kernelized adjacency matrix: </span></p><p class="P24"><span> Our data sources already represent a measure of similarity between genes. To ensure that the matrix is positive semi-definite <span class="T77">and therefore a valid kernel, </span> <span class="T77">we can</span> shif<span class="T77">t</span> its eigenvalues. This is accomplished by adding a sufficiently large constant to the diagonal of each matrix. Here we use the absolute value of the smallest eigenvalue. </span></p><p class="P22"><span class="T94"> K</span><span class="T75">A</span><span class="T94"> = A+λI with λ= absolute value of smallest eigenvalue of A</span></p><p class="P27"> </p><p class="P24"><span> - Commute time kernel:</span></p><p class="P25"><span> <span class="T89">The commute time between two nodes can be computed using L</span><span class="T78">+</span><span class="T89">, the pseudo-inverse of the Laplacian [7]. However, the commute time itself is a distance measure but it can be shown that L</span><span class="T78">+</span><span class="T89"> is the covariance matrix associated with the nodes in a feature space where the nodes are separated by their commute time [7]. Therefore (also because L is symmetric, positive and semi-definite) L</span><span class="T78">+</span><span class="T89"> is symmetric, positive and semi-definite and a valid kernel. It is called the commute time kernel. </span>It also has an interpretation in terms of electrical network as the commute time is equal to the effective resistance between two nodes [<span class="T90">8</span>]. <span class="T91">Note that for very large graphs, the commute time distance becomes meaningless [9] but this can be dealt with.</span></span></p><p class="P22"><span class="T94"> K</span><span class="T75">CT</span><span class="T94"> = L</span><span class="T79">+</span><span class="T94"> (pseudo-inverse of L)</span></p><p class="P27"> </p><p class="P24"><span> - Random forest kernel:</span></p><p class="P24"><span> This kernel arises from the enumeration of the spanning rooted forests in the graph and measures the relative "forest-accessibility" between nodes <span class="T92">and</span> has an interpretation in terms of probabilities of reaching a node in a random walk with random number of steps. <span class="T92">Its derivation is explained in</span> [<span class="T92">5,6</span>].</span></p><p class="P22"><span class="T94"> K</span><span class="T75">RF</span><span class="T94"> = (I+L)</span><span class="T80">-1</span></p><p class="P29"> </p><p class="P24"><span> - von Neumann diffusion kernel:</span></p><p class="P24"><span> This kernel enumerates all paths between 2 nodes while penalizing the longer paths [<span class="T90">4</span>]:</span></p><p class="P23"><span class="T2"> K</span><span class="T76">VN</span><span class="T2">= Σ</span><span class="T76">k</span><span class="T2">α</span><span class="T81">k</span><span class="T2">A</span><span class="T81">k </span><span class="T84">= (I-</span><span class="T85">γ</span><span class="T84">A)</span><span class="T81">-1 </span></p><p class="P23"><span class="T82"> </span><span class="T86">The penalizing factor is α</span><span class="T82">k</span><span class="T86"> (k being the length of the path) and the kernel is defined for 0<</span><span class="T87">γ</span><span class="T86"><ρ</span><span class="T83">−1</span><span class="T86"> with ρ being the spectral radius of A. </span><span class="T88">W</span><span class="T86">e </span><span class="T87">will use</span><span class="T86"> </span><span class="T88">2</span><span class="T86"> values </span><span class="T88">of </span><span class="T87">γ</span><span class="T94">: </span><span class="T85">γ</span><span class="T94"> = </span><span class="T97">0.99</span><span class="T94">ρ</span><span class="T79">−1 </span><span class="T97">and</span><span class="T94"> </span><span class="T85">γ</span><span class="T94"> = 0.</span><span class="T97">000</span><span class="T94">1ρ</span><span class="T79">−1 </span><span class="T59">which correspond respectively to </span><span class="T60">high</span><span class="T61"> </span><span class="T59">and low values of the admissible range for </span><span class="T46">γ</span><span class="T47">. </span><span class="T46">Note that in a supervised setting with enough training data, one can learn a value for γ from the data.</span></p><p class="P55"> </p><p class="P82"><span class="T10"> </span><span class="T40">- Degree-based similarity:</span></p><p class="P26"><span> We also compute a similarity matrix based only on node degree as in [<span class="T93">10</span>]:</span></p><p class="P82"><span class="T10"> K</span><span class="T43">DB</span><span class="T10">= BB</span><span class="T45">T</span><span class="T10"> where B=D</span><span class="T64">e</span><span class="T10"> and </span><span class="T64">e</span><span class="T10"> is the all-one vector (</span><span class="T64">e</span><span class="T45">T</span><span class="T10">=(1,1,...,1)).</span></p><p class="P84"><span class="T10">K</span><span class="T43">DB</span><span class="T48">(i,j) is simply the product of the degree of node i by the degree of node j. </span><span class="T14">Note that this is a valid kernel by construction.</span></p><p class="P55"> </p><p class="P82"><span class="T10"> </span><span class="T13">- Normalization:</span></p><p class="P83"><span class="T40"> Genes with multiple functions or that are more studied tend to have more links to other genes. To compensate for this effect, each kernel is normalized </span><span class="T15">to the range [-1,1] </span><span class="T40">by diag(K)</span><span class="T44">-1/2</span><span class="T40">Kdiag(K)</span><span class="T44">-1/2</span><span class="T40"> (which corresponds to computing the cosine of the vectors in the feature space of the kernel). </span><span class="T41">This also seems to alleviate the commute time issue with large graphs [9].</span></p><p class="P55"> </p><p class="P82"><span class="T10">O</span><span class="T3">ther kernels </span><span class="T9">and similarity measures</span><span class="T3"> </span><span class="T10">can be found</span><span class="T3"> in [</span><span class="T8">1</span><span class="T16">1</span><span class="T3">].</span></p><p class="P34"> </p><p class="P60">Application to finding functionally-related genes</p><p class="P56"><span class="T98"> </span><span class="T101"> </span><span class="T100">The approach we will follow is that of semi-supervised learning by label propagation. We assume that similar genes have similar function </span><span class="T101">and we can propagate function from genes with known function to similar genes for which the function is unknown. </span><span class="T100">The data processing pipeline is shown in Figure 1.</span></p><!--Next 'div' was a 'text:p'.--><div class="P57"><!--Next 'div' is emulating the top hight of a draw:frame.--><!--Next '
div' is a draw:frame.
--><div style="width:6.6929in; padding:0; float:left; position:relative; left:0cm; " class="fr1" id="Frame1"><!--Next 'div' was a 'draw:text-box'.--><div style="min-height:2.4882in;"><!--Next 'div' was a 'text:p'.--><div class="Figure"><!--Next 'div' is emulating the top hight of a draw:frame.--><!--Next '
div' is a draw:frame.
--><div style="height:2.4882in;width:6.6929in; padding:0; float:left; position:relative; left:0cm; " class="fr2" id="Image1"><img style="height:6.32cm;width:17cm;" alt="" src=""/></div><!--Next 'div' added for floating.--><div style="position:relative; left:0cm;">Figure <a id="refFigure0"/>1: Semi-supervised learning by label propagation</div></div><div style="clear:both; line-height:0; width:0; height:0; margin:0; padding:0;"> </div></div></div></div><div style="clear:both; line-height:0; width:0; height:0; margin:0; padding:0;"> </div><p class="P61">Practical</p><p class="P37">For th<span class="T107">is</span> session, we're going to work with the text mining data <span class="T106">(TM) using R.</span></p><p class="P38">First we need to load the librar<span class="T111">ies</span> we need:</p><p class="P62">## Load required packages</p><p class="P62">library(igraph) # to manipulate graphs</p><p class="P62">library(MASS) # for pseudoinverse function ginv()</p><p class="P62">library(rARPACK) # for eigs(), to compute limited number of eigenvalues</p><p class="P38">We're also going to put the data directory in a variable so that we don't need to type it in each time:</p><p class="P62">## Where the data is</p><p class="P62">dataDir <- "~/Documents/Bio-IT/courses/Data_integration_and_mining_with_graphs"</p><p class="P38">Now we read the TM data from the file. The file contains a tab-delimited list of gene pairs with their similarities.</p><p class="P62">## Read data as list of tab-delimited pairwise similarities</p><p class="P62">## This is equivalent to a list of weighted graph edges</p><p class="P62">TM <- read.delim(file.path(dataDir,"TM.txt"), header=FALSE)</p><p class="P38">We name the variables for clarity and convenience (it's easier to understand what the variable weight means than what V3 means):</p><p class="P62">## Name variables for convenience (otherwise, by default, weights are in variable named V3)</p><p class="P62">colnames(TM) <- c("geneA","geneB","weight")</p><p class="P38">We convert the data to a graph and get its adjacency matrix:</p><p class="P62">## Form undirected graph from edge list</p><p class="P62">G <- graph.data.frame(TM,directed=FALSE)</p><p class="P62">## Get adjacency matrix</p><p class="P62">A <- as_adjacency_matrix(G,type="both",names=TRUE,sparse=FALSE,attr="weight")</p><p class="P39"> </p><p class="P39">We're now ready to play with the data:</p><p class="P36">1 - Is the original TM matrix a valid kernel ? If not how would you turn it into a kernel ?</p><p class="P39">As we've seen, a sufficient condition is that the matrix is symmetric and positive semi-definite:</p><p class="P63">## Check if A is a valid kernel i.e. symmetric positive semi-definite</p><p class="P63">isSymmetric(A) # is it symmetric ?</p><p class="P63">Eig <- eigs(A,1,which="SR") # compute the smallest eigenvalue and corresponding eigenvector</p><p class="P63">Eig$values # smallest eigenvalue</p><p class="P39">The TM adjacency matrix is symmetric but not positive because the smallest eigenvalue is negative. To make it positive semi-definite, we simply add the absolute value of the smallest eigenvalue to the diagonal:</p><p class="P63">## Make symmetric adjacency matrix a valid kernel</p><p class="P63">I <- diag(nrow(A)) # Identity matrix of the same size as A</p><p class="P63">A <- A + I*abs(Eig$values) # Now smallest eigenvalue of A is 0, A is a valid kernel</p><p class="P35"> </p><p class="P35">2- Compute some kernels for the data including the degree-based similarity <span class="T108">and</span> <span class="T108">normalize the matrices:</span></p><p class="P40">Let's start with the commute time kernel:</p><p class="P64">## Get Laplacian matrix</p><p class="P64">L <- laplacian_matrix(G,normalized=FALSE,sparse=FALSE) # weights are set to the weight attribute by default</p><p class="P64">## Commute time kernel</p><p class="P64">CT <- ginv(L) <span class="T109"># Matrix pseudo-inverse</span></p><p class="P64">## Normalize kernel</p><p class="P64">D <- diag(1./sqrt(diag(CT)))</p><p class="P64">CT <- D %*% CT %*% D</p><p class="P41">Now for the degree-based kernel:</p><p class="P65">## Degree-based kernel</p><p class="P65">B <- as.matrix(rowSums(A)) # column vector</p><p class="P65">DB <- B %*% t(B)</p><p class="P66">## Normalize kernel</p><p class="P66">D <- diag(1./sqrt(diag(DB)))</p><p class="P66">DB <- D %*% DB %*% D</p><p class="P42">And the diffusion kernel with two different values of its parameter:</p><p class="P66">## von Neumann diffusion kernel</p><p class="P66">## using different parameter values between 0 and 1/rho where rho = spectral radius of A</p><p class="P66">rhoA <- eigs(A,1,which="LM")</p><p class="P66">gamma <- 0.99/rhoA$values # large value, make sure gamma < 1/rho so that matrix is invertible</p><p class="P66">VNmax <- solve(I - gamma*A) <span class="T112"># Matrix inverse</span></p><p class="P66">## Normalize kernel</p><p class="P66">D <- diag(1./sqrt(diag(VNmax)))</p><p class="P66">VNmax <- D %*% VNmax %*% D</p><p class="P66">## VN kernel for a small parameter value</p><p class="P66">gamma <- 0.0001/rhoA$values # small value</p><p class="P66">VNmin <- solve(I - gamma*A)</p><p class="P66">D <- diag(1./sqrt(diag(VNmin)))</p><p class="P66">VNmin <- D %*% VNmin %*% D</p><p class="P40"> </p><p class="P35">3- Using different kernels, retrieve the top 20 genes related to the ANAPC, a random list of genes and a mixed list of ANAPC and DNA replication genes.</p><p class="P43">First we name the kernel rows and columns with the corresponding gene symbols to allow convenient access to similarities by gene symbol:</p><p class="P67">## Name kernel rows and columns for later convenience</p><p class="P67">colnames(CT) <- colnames(L)</p><p class="P67">rownames(CT) <- rownames(L)</p><p class="P67">colnames(VNmax) <- colnames(L)</p><p class="P67">rownames(VNmax) <- rownames(L)</p><p class="P67">colnames(VNmin) <- colnames(L)</p><p class="P67">rownames(VNmin) <- rownames(L)</p><p class="P67">rownames(DB) <- rownames(L)</p><p class="P44">Now we read the list of ANAPC genes:</p><p class="P68">## Read list of query genes</p><p class="P69">query_ANAPC <- read.table(file.path(dataDir,"query_ANAPC.txt"), quote="\"", <span class="T113">stringsAsFactors=FALSE) # stringsAsFactors=FALSE to keep gene symbols as strings</span></p><p class="P45">Query the CT kernel by extracting the columns corresponding to the query genes:</p><p class="P70">## Query CT kernel</p><p class="P71">CT_ANAPC<-CT[,<span class="T110">query_ANAPC$V1</span>]</p><p class="P47"><span class="T118">If a query gene is not in the data, we get an error, so it is safer to go by column indices as in the following which would also work </span>if we hadn't named the columns or hadn't read the gene symbols as strings with <span class="T114">stringsAsFactors=FALSE:</span></p><p class="P72">index <- colnames(CT) %in% query_ANAPC$V1</p><p class="P72">CT_ANAPC<-CT[,index]</p><p class="P46">Again, for convenience, we name the rows with the corresponding gene symbols:</p><p class="P72">## Name the rows</p><p class="P72">rownames(CT_ANAPC)<-rownames(CT)</p><p class="P46">We then sum the rows to get the sum of similarity of each gene to the query genes, sort from highest to lowest similarity and show the top 20:</p><p class="P72">## Sum the rows</p><p class="P72">CT_ANAPC <- rowSums(CT_ANAPC)</p><p class="P72">## Sort by decreasing values</p><p class="P72">CT_ANAPC <- sort(CT_ANAPC,decreasing=TRUE)</p><p class="P72">## Show the top 20</p><p class="P72">CT_ANAPC[1:20]</p><p class="P48"><span class="T117">We</span> can now proceed in the same way for the other kernels and query lists.</p><p class="P48"> </p><p class="P35">4- Inspect the results. Are they what you expected ?</p><p class="P52">We can see that when the query consists of known functionally-related genes, the genes returned at the top of the list are other genes known to be involved in the same process as the query genes. When using random or mixed queries, the returned genes don't seem to make sense.</p><p class="P52"> </p><p class="P35">5- How would you define the most relevant genes ?</p><p class="P49">Plot the ordered similarities to the query, <span class="T116">most of the time the top couple hundreds are enough.</span></p><p class="P73">plot(CT_ANAPC<span class="T116">[1:200], yaxt=”n”</span>) <span class="T116"># No ticks on y axis</span></p><p class="P74">axis(side = 2, at = seq(0,max(CT_ANAPC)+1,0.1)) # Add ticks every 0.1 on y axis</p><p class="P50">Notice the elbow followed by a long flat tail <span class="T115">of low values. Similarity values on the left of the elbow can be considered significantly higher than those from a random selection of genes.</span></p><p class="P50"> </p><p class="P35">6- How would you find which is the best kernel for this data set ?</p><p class="P51">First, we need lists of known functionally-related genes. These could be obtained from various databases e.g. Panther, KEGG, Reactome... <span class="T119">Note that the choice of reference database affects the definition of functional relationship.</span></p><p class="P51">Using these lists, we can take a cross-validation approach: For each list, take some genes as query and find the rank of the other genes from the list. We can define sensitivity as the fraction of these target genes that are in the top N%. If we assume that random genes are unlikely to be <span class="T117">functionally </span>related, we can estimate the false positive rate by measuring the fraction of target genes in the top N% <span class="T117">using random lists of genes matching those from the database</span>. <span class="T117">Plotting the average sensitivity and false positive rate across all gene lists for different rank thresholds allows to compare the different kernels. Usually the false positive rate is acceptably low for most kernels so the main difference between them comes from the difference in sensitivity.</span></p><p class="P53">The same approach can be used to see that data integration by combining kernels gives better results than individual kernels.</p><p class="P53"><span> </span></p><p class="P53"><span> </span></p><p class="P53"><span> </span></p><p class="P53"> </p><p class="P53"> </p><p class="P97">References</p><p class="P21"><span class="T105">1. Wang PI, Marcotte EM. </span><a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2953423/" class="Internet_20_link">It's the machine that matters: Predicting gene function and phenotype from protein networks</a><span class="T65">. </span><span class="T105">J Proteomics 2010;73:2277-89.</span></p><p class="P87"><span class="T120">2. De Bie T, Tranchevent LC, van Oeffelen LM et al.. </span><a href="http://bioinformatics.oxfordjournals.org/content/23/13/i125.full" class="Internet_20_link"><span class="T120">Kernel-based data fusion for gene prioritization</span></a><span class="T120">. Bioinformatics 2007;23:i125-32.<br/>3. Qi Y, Suhail Y, Lin Y et al.. </span><a href="http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2593582/" class="Internet_20_link"><span class="T120">Finding friends and enemies in an enemies-only network: A graph diffusion kernel for predicting novel genetic interactions and co-complex membership from yeast genetic interactions</span></a><span class="T120">. Genome Research 2008;18:1991-2004.<br/></span><span class="T66">4</span><span class="T67">. Shawe-Taylor J, Christianini N. </span><a href="http://www.kernel-methods.net/" class="Internet_20_link"><span class="T121">Kernel methods for pattern analysis</span></a><span class="T67">. Cambridge: Cambridge University Press, 2004.<br/></span><span class="T73">5</span><span class="T72">. Chebotarev P, Shamis E. The </span><a href="http://arxiv.org/pdf/math/0602070v1.pdf" class="Internet_20_link"><span class="T121">Matrix-Forest Theorem and Measuring Relations in Small Social Groups</span></a><span class="T72">. Automation and Remote Control 1997;58:1505-1514.<br/></span><span class="T68">6</span><span class="T67">. Chebotarev P. </span><a href="http://arxiv.org/pdf/math/0612792.pdf" class="Internet_20_link"><span class="T121">Spanning forests and the golden ratio</span></a><span class="T67">. Discrete Applied Mathematics 2008;156: 813-821.<br/></span><span class="T73">7</span><span class="T72">. Fouss F, Pirotte A, Renders JM et al. </span><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.71.2194&rep=rep1&type=pdf" class="Internet_20_link"><span class="T121">Random-Walk Computation of Similarities between Nodes of a Graph with Application to Collaborative Recommendation</span></a><span class="T72">. IEEE Trans Knowl Data Eng 2007;19:355-369.<br/></span><span class="T68">8</span><span class="T67">. Qiu HJ, Hancock ER. </span><a href="https://www.computer.org/csdl/trans/tp/2007/11/i1873.pdf" class="Internet_20_link"><span class="T121">Clustering and embedding using commute times</span></a><span class="T67">. IEEE Trans Pattern Anal Mach Intell 2007;29:1873–1890.<br/></span><span class="T122">9. von Luxburg U, Radl A, Hein M. </span><a href="http://papers.nips.cc/paper/3891-getting-lost-in-space-large-sample-analysis-of-the-resistance-distance.pdf" class="Internet_20_link"><span class="T122">Getting lost in space: large sample analysis of the commute distance</span></a><span class="T122">. In Proceedings of the 23th neural information processing systems conference. NIPS 2010 (pp. 2622–2630).<br/></span><span class="T69">10</span><span class="T67">. Gillis J, Pavlidis P. </span><a href="http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0017258" class="Internet_20_link"><span class="T121">The Impact of Multifunctional Genes on "Guilt by Association" Analysis</span></a><span class="T67">. PLoS One 2011;6:e17258.<br/></span><span class="T70">11. Fouss </span><span class="T71">F</span><span class="T70">, Francoisse </span><span class="T71">K</span><span class="T70">, Yen </span><span class="T71">Y</span><span class="T68"> </span><span class="T71">et al. </span><a href="http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.434.5836&rep=rep1&type=pdf" class="Internet_20_link"><span class="T121">An experimental investigation of kernels on graphs for collaborative recommendation and semisupervised classification</span></a><span class="T68">. Neural Networks 2012; </span><span class="T71">31:</span><span class="T68">53–72.</span></p></body></html>