% references.bib
@article{8118483,
author = {Wagner, L.},
journal = {IEEE Spectrum},
title = {Turbocharging the Web},
year = {2017},
volume = {54},
number = {12},
pages = {48--53},
doi = {10.1109/MSPEC.2017.8118483}
}
@inproceedings{10.1145/3062341.3062363,
author = {Haas, Andreas and Rossberg, Andreas and Schuff, Derek L. and Titzer, Ben L. and Holman, Michael and Gohman, Dan and Wagner, Luke and Zakai, Alon and Bastien, JF},
title = {Bringing the Web up to Speed with {WebAssembly}},
year = {2017},
isbn = {9781450349888},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3062341.3062363},
doi = {10.1145/3062341.3062363},
abstract = { The maturation of the Web platform has given rise to sophisticated and demanding Web applications such as interactive 3D visualization, audio and video software, and games. With that, efficiency and security of code on the Web has become more important than ever. Yet JavaScript as the only built-in language of the Web is not well-equipped to meet these requirements, especially as a compilation target. Engineers from the four major browser vendors have risen to the challenge and collaboratively designed a portable low-level bytecode called WebAssembly. It offers compact representation, efficient validation and compilation, and safe low to no-overhead execution. Rather than committing to a specific programming model, WebAssembly is an abstraction over modern hardware, making it language-, hardware-, and platform-independent, with use cases beyond just the Web. WebAssembly has been designed with a formal semantics from the start. We describe the motivation, design and formal semantics of WebAssembly and provide some preliminary experience with implementations. },
booktitle = {Proceedings of the 38th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {185--200},
numpages = {16},
keywords = {type systems, virtual machines, just-in-time compilers, assembly languages, programming languages},
location = {Barcelona, Spain},
series = {PLDI 2017}
}
@inproceedings{10.1145/3167082,
author = {Watt, Conrad},
title = {Mechanising and Verifying the {WebAssembly} Specification},
year = {2018},
isbn = {9781450355865},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3167082},
doi = {10.1145/3167082},
abstract = {WebAssembly is a new low-level language currently being implemented in all major web browsers. It is designed to become the universal compilation target for the web, obsoleting existing solutions in this area, such as asm.js and Native Client. The WebAssembly working group has incorporated formal techniques into the development of the language, but their efforts so far have focussed on pen and paper formal specification.We present a mechanised Isabelle specification for the WebAssembly language, together with a verified executable interpreter and type checker. Moreover, we present a fully mechanised proof of the soundness of the WebAssembly type system, and detail how our work on this proof has exposed several issues with the official WebAssembly specification, influencing its development. Finally, we give a brief account of our efforts in performing differential fuzzing of our interpreter against industry implementations.},
booktitle = {Proceedings of the 7th ACM SIGPLAN International Conference on Certified Programs and Proofs},
pages = {53--65},
numpages = {13},
keywords = {bytecode, stack machine, soundness, reduction},
location = {Los Angeles, CA, USA},
series = {CPP 2018}
}
@inproceedings{10.1145/2048147.2048224,
author = {Zakai, Alon},
title = {Emscripten: An {LLVM}-to-{JavaScript} Compiler},
year = {2011},
isbn = {9781450309424},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2048147.2048224},
doi = {10.1145/2048147.2048224},
abstract = {We present Emscripten, a compiler from LLVM (Low Level Virtual Machine) assembly to JavaScript. This opens up two avenues for running code written in languages other than JavaScript on the web: (1) Compile code directly into LLVM assembly, and then compile that into JavaScript using Emscripten, or (2) Compile a language's entire runtime into LLVM and then JavaScript, as in the previous approach, and then use the compiled runtime to run code written in that language. For example, the former approach can work for C and C++, while the latter can work for Python; all three examples open up new opportunities for running code on the web.Emscripten itself is written in JavaScript and is available under the MIT license (a permissive open source license), at http://www.emscripten.org. As a compiler from LLVM to JavaScript, the challenges in designing Emscripten are somewhat the reverse of the norm - one must go from a low-level assembly into a high-level language, and recreate parts of the original high-level structure of the code that were lost in the compilation to low-level LLVM. We detail the methods used in Emscripten to deal with those challenges, and in particular present and prove the validity of Emscripten's Relooper algorithm, which recreates high-level loop structures from low-level branching data.},
booktitle = {Proceedings of the ACM International Conference Companion on Object Oriented Programming Systems Languages and Applications Companion},
pages = {301--312},
numpages = {12},
keywords = {llvm, javascript, decompiler},
location = {Portland, Oregon, USA},
series = {OOPSLA '11}
}
@inproceedings{234914,
author = {Jangda, Abhinav and Powers, Bobby and Berger, Emery D. and Guha, Arjun},
title = {Not So Fast: Analyzing the Performance of {WebAssembly} vs. Native Code},
booktitle = {2019 {USENIX} Annual Technical Conference ({USENIX} {ATC} 19)},
year = {2019},
isbn = {978-1-939133-03-8},
address = {Renton, WA},
pages = {107--120},
url = {https://www.usenix.org/conference/atc19/presentation/jangda},
publisher = {{USENIX} Association},
month = jul
}
@mastersthesis{llvm-thesis,
author = {Lattner, Chris},
title = {{LLVM}: An Infrastructure for Multi-Stage Optimization},
school = {Computer Science Dept., University of Illinois at Urbana-Champaign},
year = {2002},
address = {Urbana, IL},
month = dec
}
@inproceedings{adler32-paper,
author = {Fetzer, C.},
booktitle = {International Conference on Dependable Systems and Networks (DSN'06)},
title = {Student Forum},
year = {2006},
pages = {594},
doi = {10.1109/DSN.2006.68}
}
@article{ibm-ssa,
author = {Cytron, Ron and Ferrante, Jeanne and Rosen, Barry K. and Wegman, Mark N. and Zadeck, F. Kenneth},
title = {Efficiently Computing Static Single Assignment Form and the Control Dependence Graph},
year = {1991},
issue_date = {Oct. 1991},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {13},
number = {4},
issn = {0164-0925},
url = {https://doi.org/10.1145/115372.115320},
doi = {10.1145/115372.115320},
journal = {ACM Trans. Program. Lang. Syst.},
month = oct,
pages = {451--490},
numpages = {40},
keywords = {def-use chain, optimizing compilers, control flow graph, control dependence, dominator}
}
@inproceedings{trufflewasm,
author = {Salim, Salim S. and Nisbet, Andy and Luj\'{a}n, Mikel},
title = {{TruffleWasm}: A {WebAssembly} Interpreter on {GraalVM}},
year = {2020},
isbn = {9781450375542},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3381052.3381325},
doi = {10.1145/3381052.3381325},
abstract = {WebAssembly is a binary format originally designed for web-based deployment and execution combined with JavaScript. WebAssembly can also be used for standalone programs provided a WebAssembly runtime environment is available.This paper describes the design and implementation of TruffleWasm, a guest language implementation of a WebAssembly hosted on Truffle and GraalVM. Truffle is a Java framework capable of constructing and interpreting an Abstract Syntax Tree (AST) representing a program on standard JVMs. GraalVM is a JVM with a JIT compiler which optimises the execution of ASTs from Truffle.Our work is motivated by trying to understand the advantages and disadvantages of using GraalVM, and its support for multiple programming languages, to build a standalone WebAssembly runtime. This contrast with developing a new runtime, as Wasmtime and other projects are undertaking. TruffleWasm can execute standalone WebAssembly modules, while offering also interoperability with other GraalVM hosted languages, such as Java, JavaScript, R, Python and Ruby.The experimental results compare the peak performance of TruffleWasm to the standalone Wasmtime runtime for the Shootout, C benchmarks in JetStream, and the Poly-BenchC benchmarks. The results show the geo-mean peak performance of TruffleWasm is 4% slower than Wasmtime for Shootout/JetStream, and 4% faster for PolyBenchC.},
booktitle = {Proceedings of the 16th ACM SIGPLAN/SIGOPS International Conference on Virtual Execution Environments},
pages = {88--100},
numpages = {13},
keywords = {WebAssembly, JVM, just in time compilation, Wasm, GraalVM},
location = {Lausanne, Switzerland},
series = {VEE '20}
}
@inproceedings{auto-vec-gcc,
author = {Nuzman, D. and Henderson, R.},
booktitle = {International Symposium on Code Generation and Optimization (CGO'06)},
title = {Multi-Platform Auto-Vectorization},
year = {2006},
doi = {10.1109/CGO.2006.25}
}
@article{sse-intel,
author = {Raman, S. K. and Pentkovski, V. and Keshava, J.},
journal = {IEEE Micro},
title = {Implementing Streaming {SIMD} Extensions on the {Pentium III} Processor},
year = {2000},
volume = {20},
number = {4},
pages = {47--57},
doi = {10.1109/40.865866}
}
@article{avx-intel,
author = {Firasta, Nadeem and Buxton, Mark and Jinbo, Paula and Nasri, Kaveh and Kuo, Shihjong},
title = {Intel {AVX}: New frontiers in performance improvements and energy efficiency},
journal = {Intel white paper},
year = {2008},
volume = {19},
number = {20}
}
@inproceedings{arm-neon,
author = {Jang, Minwoo and Kim, Kukhyun and Kim, Kanghee},
title = {The Performance Analysis of {ARM NEON} Technology for Mobile Platforms},
year = {2011},
isbn = {9781450310871},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2103380.2103401},
doi = {10.1145/2103380.2103401},
abstract = {Mobile platforms have compute-bound applications, which will be executed on top of ARM processors in many cases. For such applications, the use of the ARM NEON technology should be considered since it can significantly increase the computing power of the underlying processor for vector operations. The ARM NEON technology is a media processing architecture based on SIMD (Single Instruction Multiple Data) that adds instructions targeted primarily for audio, video, and 3-D graphics processing. In this paper, we evaluate the ARM NEON technology in ARM Cortex-A8 processor for several open-source applications. To do this, we use the auto-vectorization feature of the ARM GCC compiler, which is implemented with the NEON technology. The evaluation criteria include program code size, execution time and power consumption, which are crucial in most embedded systems.},
booktitle = {Proceedings of the 2011 ACM Symposium on Research in Applied Computation},
pages = {104--106},
numpages = {3},
keywords = {SIMD, NEON technology, embedded system, ARM cortex-A8, mobile platform},
location = {Miami, Florida},
series = {RACS '11}
}
@inproceedings{webassembly-survey,
author = {Musch, Marius and Wressnegger, Christian and Johns, Martin and Rieck, Konrad},
editor = {Perdisci, Roberto and Maurice, Cl{\'e}mentine and Giacinto, Giorgio and Almgren, Magnus},
title = {New Kid on the Web: A Study on the Prevalence of {W}eb{A}ssembly in the Wild},
booktitle = {Detection of Intrusions and Malware, and Vulnerability Assessment},
year = {2019},
publisher = {Springer International Publishing},
address = {Cham},
pages = {23--42},
isbn = {978-3-030-22038-9},
abstract = {WebAssembly, or Wasm for short, is a new, low-level language that allows for near-native execution performance and is supported by all major browsers as of today. In comparison to JavaScript it offers faster transmission, parsing, and execution times. Up until now it has, however, been largely unclear what WebAssembly is used for in the wild. In this paper, we thus conduct the first large-scale study on the Web. For this, we examine the prevalence of WebAssembly in the Alexa Top 1 million websites and find that as many as 1 out of 600 sites execute Wasm code. Moreover, we perform several secondary analyses, including an evaluation of code characteristics and the assessment of a Wasm module's field of application. Based on this, we find that over 50 {\%} of all sites using WebAssembly apply it for malicious deeds, such as mining and obfuscation.}
}
% On the Use of Web Assembly in a Serverless Context
@inproceedings{10.1007/978-3-030-58858-8_15,
author = {Murphy, Se{\'a}n and Persaud, Leonardas and Martini, William and Bosshard, Bill},
editor = {Paasivaara, Maria and Kruchten, Philippe},
title = {On the Use of {Web} {Assembly} in a Serverless Context},
booktitle = {Agile Processes in Software Engineering and Extreme Programming -- Workshops},
year = {2020},
publisher = {Springer International Publishing},
address = {Cham},
pages = {141--145},
abstract = {This paper considers how WASM can be run in different serverless contexts. A comparison of different serverside WASM runtime options is considered, specifically focused on wasmer, wasmtime and lucet. Next, different options for running WASM within two serverless platforms -- Openwhisk and AWS Lambda -- are compared. Initial results show that a solution which uses the built-in node.js WASM supports is found to work better than using the dedicated WASM runtimes but this has limitations and providing more direct integration with WASM runtimes should be explored further.},
isbn = {978-3-030-58858-8}
}
% Valgrind
@inproceedings{valgrind-paper,
author = {Nethercote, Nicholas and Seward, Julian},
title = {Valgrind: A Framework for Heavyweight Dynamic Binary Instrumentation},
year = {2007},
isbn = {9781595936332},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/1250734.1250746},
doi = {10.1145/1250734.1250746},
abstract = {Dynamic binary instrumentation (DBI) frameworks make it easy to build dynamic binary analysis (DBA) tools such as checkers and profilers. Much of the focus on DBI frameworks has been on performance; little attention has been paid to their capabilities. As a result, we believe the potential of DBI has not been fully exploited.In this paper we describe Valgrind, a DBI framework designed for building heavyweight DBA tools. We focus on its unique support for shadow values-a powerful but previously little-studied and difficult-to-implement DBA technique, which requires a tool to shadow every register and memory value with another value that describes it. This support accounts for several crucial design features that distinguish Valgrind from other DBI frameworks. Because of these features, lightweight tools built with Valgrind run comparatively slowly, but Valgrind can be used to build more interesting, heavyweight tools that are difficult or impossible to build with other DBI frameworks such as Pin and DynamoRIO.},
booktitle = {Proceedings of the 28th ACM SIGPLAN Conference on Programming Language Design and Implementation},
pages = {89--100},
numpages = {12},
keywords = {dynamic binary instrumentation, Memcheck, shadow values, Valgrind, dynamic binary analysis},
location = {San Diego, California, USA},
series = {PLDI '07}
}
% Stack-based vs register based VM
@article{stack-and-register-vm,
author = {Shi, Yunhe and Casey, Kevin and Ertl, M. Anton and Gregg, David},
title = {Virtual Machine Showdown: Stack versus Registers},
journal = {ACM Trans. Archit. Code Optim.},
year = {2008},
issue_date = {January 2008},
month = jan,
volume = {4},
number = {4},
articleno = {2},
numpages = {36},
issn = {1544-3566},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/1328195.1328197},
doi = {10.1145/1328195.1328197},
keywords = {register architecture, stack architecture, Interpreter, virtual machine},
abstract = {Virtual machines (VMs) enable the distribution of programs in an architecture-neutral format, which can easily be interpreted or compiled. A long-running question in the design of VMs is whether a stack architecture or register architecture can be implemented more efficiently with an interpreter. We extend existing work on comparing virtual stack and virtual register architectures in three ways. First, our translation from stack to register code and optimization are much more sophisticated. The result is that we eliminate an average of more than 46% of executed VM instructions, with the bytecode size of the register machine being only 26% larger than that of the corresponding stack one. Second, we present a fully functional virtual-register implementation of the Java virtual machine (JVM), which supports Intel, AMD64, PowerPC and Alpha processors. This register VM supports inline-threaded, direct-threaded, token-threaded, and switch dispatch. Third, we present experimental results on a range of additional optimizations such as register allocation and elimination of redundant heap loads. On the AMD64 architecture the register machine using switch dispatch achieves an average speedup of 1.48 over the corresponding stack machine. Even using the more efficient inline-threaded dispatch, the register VM achieves a speedup of 1.15 over the equivalent stack-based VM.}
}
@inproceedings{numba,
author = {Lam, Siu Kwan and Pitrou, Antoine and Seibert, Stanley},
title = {Numba: A {LLVM}-Based Python {JIT} Compiler},
booktitle = {Proceedings of the Second Workshop on the LLVM Compiler Infrastructure in HPC},
series = {LLVM '15},
location = {Austin, Texas},
year = {2015},
isbn = {9781450340052},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/2833157.2833162},
doi = {10.1145/2833157.2833162},
articleno = {7},
numpages = {6},
keywords = {LLVM, Python, compiler},
abstract = {Dynamic, interpreted languages, like Python, are attractive for domain-experts and scientists experimenting with new ideas. However, the performance of the interpreter is often a barrier when scaling to larger data sets. This paper presents a just-in-time compiler for Python that focuses in scientific and array-oriented computing. Starting with the simple syntax of Python, Numba compiles a subset of the language into efficient machine code that is comparable in performance to a traditional compiled language. In addition, we share our experience in building a JIT compiler using LLVM[1].}
}
@inproceedings{mcsaf,
author = {Doherty, Jesse and Hendren, Laurie},
title = {{McSAF}: A Static Analysis Framework for {MATLAB}},
year = {2012},
isbn = {9783642310560},
publisher = {Springer-Verlag},
address = {Berlin, Heidelberg},
url = {https://doi.org/10.1007/978-3-642-31057-7_7},
doi = {10.1007/978-3-642-31057-7_7},
abstract = {Matlab is an extremely popular programming language used by scientists, engineers, researchers and students world-wide. Despite its popularity, it has received very little attention from compiler researchers. This paper introduces McSaf, an open-source static analysis framework which is intended to enable more compiler research for Matlab and extensions of Matlab. The framework is based on an intermediate representation (IR) called McLast, which has been designed to capture all the key features of Matlab, while at the same time being simple for program analysis. The paper describes both the IR and the procedure for creating the IR from the higher-level AST. The analysis framework itself provides visitor-based traversals including fixed-point-based traversals to support both forwards and backwards analyses. McSaf has been implemented as part of the McLab project, and the framework has already been used for a variety of analyses, both for Matlab and the AspectMatlab extension.},
booktitle = {Proceedings of the 26th European Conference on Object-Oriented Programming},
pages = {132--155},
numpages = {24},
location = {Beijing, China},
series = {ECOOP'12}
}
@article{tarjan-fast-dominator,
author = {Lengauer, Thomas and Tarjan, Robert Endre},
title = {A Fast Algorithm for Finding Dominators in a Flowgraph},
year = {1979},
issue_date = {July 1979},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {1},
number = {1},
issn = {0164-0925},
url = {https://doi.org/10.1145/357062.357071},
doi = {10.1145/357062.357071},
abstract = {A fast algorithm for finding dominators in a flowgraph is presented. The algorithm uses depth-first search and an efficient method of computing functions defined on paths in trees. A simple implementation of the algorithm runs in O(m log n) time, where m is the number of edges and n is the number of vertices in the problem graph. A more sophisticated implementation runs in O(m$\alpha$(m, n)) time, where $\alpha$(m, n) is a functional inverse of Ackermann's function.Both versions of the algorithm were implemented in Algol W, a Stanford University version of Algol, and tested on an IBM 370/168. The programs were compared with an implementation by Purdom and Moore of a straightforward O(mn)-time algorithm, and with a bit vector algorithm described by Aho and Ullman. The fast algorithm beat the straightforward algorithm and the bit vector algorithm on all but the smallest graphs tested.},
journal = {ACM Trans. Program. Lang. Syst.},
month = jul,
pages = {121--141},
numpages = {21}
}
@article{tarjan-fast-dominator-improved,
author = {Georgiadis, Loukas and Tarjan, Robert E. and Werneck, Renato F.},
title = {Finding Dominators in Practice},
journal = {Journal of Graph Algorithms and Applications},
year = {2006},
volume = {10},
number = {1},
pages = {69--94},
issn = {1526-1719},
url = {http://jgaa.info/getPaper?id=119},
doi = {10.7155/jgaa.00119},
urldate = {2021-05-14},
language = {en}
}
@article{peephole-opt,
author = {McKeeman, W. M.},
title = {Peephole Optimization},
year = {1965},
issue_date = {July 1965},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {8},
number = {7},
issn = {0001-0782},
url = {https://doi.org/10.1145/364995.365000},
doi = {10.1145/364995.365000},
abstract = {Redundant instructions may be discarded during the final stage of compilation by using a simple optimizing technique called peephole optimization. The method is described and examples are given.},
journal = {Commun. ACM},
month = jul,
pages = {443--444},
numpages = {2}
}
@article{alive,
author = {Lopes, Nuno P. and Menendez, David and Nagarakatte, Santosh and Regehr, John},
title = {Provably Correct Peephole Optimizations with Alive},
year = {2015},
issue_date = {June 2015},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {50},
number = {6},
issn = {0362-1340},
url = {https://doi.org/10.1145/2813885.2737965},
doi = {10.1145/2813885.2737965},
abstract = { Compilers should not miscompile. Our work addresses problems in developing peephole optimizations that perform local rewriting to improve the efficiency of LLVM code. These optimizations are individually difficult to get right, particularly in the presence of undefined behavior; taken together they represent a persistent source of bugs. This paper presents Alive, a domain-specific language for writing optimizations and for automatically either proving them correct or else generating counterexamples. Furthermore, Alive can be automatically translated into C++ code that is suitable for inclusion in an LLVM optimization pass. Alive is based on an attempt to balance usability and formal methods; for example, it captures---but largely hides---the detailed semantics of three different kinds of undefined behavior in LLVM. We have translated more than 300 LLVM optimizations into Alive and, in the process, found that eight of them were wrong. },
journal = {SIGPLAN Not.},
month = jun,
pages = {22--32},
numpages = {11},
keywords = {Compiler Verification, Peephole Optimization, Alive}
}
@inproceedings{alive-in-lean,
author = {Lee, Juneyoung and Hur, Chung{-}Kil and Lopes, Nuno P.},
editor = {Dillig, Isil and Tasiran, Serdar},
title = {{AliveInLean}: A Verified {LLVM} Peephole Optimization Verifier},
booktitle = {Computer Aided Verification - 31st International Conference, {CAV} 2019, New York City, NY, USA, July 15-18, 2019, Proceedings, Part {II}},
series = {Lecture Notes in Computer Science},
volume = {11562},
pages = {445--455},
publisher = {Springer},
year = {2019},
url = {https://doi.org/10.1007/978-3-030-25543-5_25},
doi = {10.1007/978-3-030-25543-5_25},
timestamp = {Fri, 27 Mar 2020 08:45:57 +0100},
biburl = {https://dblp.org/rec/conf/cav/LeeHL19.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
@mastersthesis{alias-sable,
author = {Emami, Maryam},
title = {A Practical Interprocedural Alias Analysis for an Optimizing/Parallelizing {C} Compiler},
school = {McGill University},
year = {1993}
}
@article{point-to-survey,
author = {Smaragdakis, Yannis and Balatsouras, George},
title = {Pointer Analysis},
journal = {Foundations and Trends in Programming Languages},
volume = {2},
number = {1},
pages = {1--69},
year = {2015},
publisher = {Now Publishers Inc. Hanover, MA, USA}
}
@inproceedings{point-to-microsoft,
author = {Foster, Jeffrey S and F{\"a}hndrich, Manuel and Aiken, Alexander},
title = {Polymorphic versus monomorphic flow-insensitive points-to analysis for {C}},
booktitle = {International Static Analysis Symposium},
year = {2000},
pages = {175--198},
organization = {Springer}
}
@article{point-to-undecidable,
author = {Ramalingam, Ganesan},
title = {The undecidability of aliasing},
journal = {ACM Transactions on Programming Languages and Systems (TOPLAS)},
year = {1994},
volume = {16},
number = {5},
pages = {1467--1471},
publisher = {ACM New York, NY, USA}
}
@inproceedings{fast-liveness,
title = {Fast liveness checking for {SSA}-form programs},
author = {Boissinot, Beno{\^\i}t and Hack, Sebastian and Grund, Daniel and Dupont de Dinechin, Beno{\^\i}t and Rastello, Fabrice},
booktitle = {Proceedings of the 6th annual IEEE/ACM international symposium on Code generation and optimization},
pages = {35--44},
year = {2008}
}
@inproceedings{polybench,
title = {Understanding {PolyBench/C} 3.2 Kernels},
author = {Yuki, Tomofumi},
booktitle = {International workshop on Polyhedral Compilation Techniques (IMPACT)},
pages = {1--5},
year = {2014}
}
@inproceedings{ostrich,
author = {Herrera, David and Chen, Hanfeng and Lavoie, Erick and Hendren, Laurie},
title = {Numerical computing on the web: Benchmarking for the future},
booktitle = {Proceedings of the 14th ACM SIGPLAN International Symposium on Dynamic Languages},
year = {2018},
pages = {88--100}
}
@techreport{npb,
title = {{NAS} Parallel Benchmarks Version 2.4},
author = {Van der Wijngaart, Rob F. and Wong, Parkson},
year = {2002},
institution = {NASA Advanced Supercomputing (NAS)}
}
@inproceedings{webassembly-embedded,
author = {Scheidl, Fabian},
booktitle = {2020 International Conference on Computing, Electronics \& Communications Engineering (iCCECE)},
title = {Valent-Blocks: Scalable High-Performance Compilation of {WebAssembly} Bytecode For Embedded Systems},
year = {2020},
pages = {119--124},
doi = {10.1109/iCCECE49321.2020.9231154}
}
@inproceedings{webassembly-wearables,
title = {Virtual Machine Execution for Wearables Based on {WebAssembly}},
author = {Jacobsson, Martin and Will{\'e}n, Jonas},
booktitle = {EAI International Conference on Body Area Networks},
pages = {381--389},
year = {2018},
organization = {Springer}
}
@misc{webassembly-sgx,
title = {Twine: An Embedded Trusted Runtime for {WebAssembly}},
author = {M{\'e}n{\'e}trey, J{\"a}mes and Pasin, Marcelo and Felber, Pascal and Schiavoni, Valerio},
year = {2021},
eprint = {2103.15860},
archiveprefix = {arXiv},
primaryclass = {cs.CR}
}
@article{arm-sve,
author = {Stephens, Nigel and Biles, Stuart and Boettcher, Matthias and Eapen, Jacob and Eyole, Mbou and Gabrielli, Giacomo and Horsnell, Matt and Magklis, Grigorios and Martinez, Alejandro and Premillieu, Nathanael and others},
title = {The {ARM} Scalable Vector Extension},
journal = {IEEE Micro},
year = {2017},
month = mar,
volume = {37},
number = {2},
pages = {26--39},
issn = {0272-1732},
url = {http://dx.doi.org/10.1109/MM.2017.35},
doi = {10.1109/mm.2017.35},
publisher = {Institute of Electrical and Electronics Engineers (IEEE)}
}
@misc{sse-avx-speedup,
title = {Performance of {SSE} and {AVX} Instruction Sets},
author = {Jeong, Hwancheol and Kim, Sunghoon and Lee, Weonjong and Myung, Seok-Ho},
year = {2012},
eprint = {1211.0820},
archiveprefix = {arXiv},
primaryclass = {hep-lat}
}
@article{polyhedral,
author = {Quiller{\'e}, Fabien and Rajopadhye, Sanjay},
title = {Optimizing Memory Usage in the Polyhedral Model},
year = {2000},
issue_date = {Sept. 2000},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {22},
number = {5},
issn = {0164-0925},
url = {https://doi.org/10.1145/365151.365152},
doi = {10.1145/365151.365152},
abstract = {The polyhedral model provides a single unified foundation for systolic array synthesis and automatic parallelization of loop programs. We investigate the problem of memory reuse when compiling Alpha (a functional language based on this model). Direct compilation would require unacceptably large memory (for example $O(n^3)$ for matrix multiplication). Researchers have previously addressed the problem of memory reuse, and the analysis that this entails for projective memory allocations. This paper addresses, for a given schedule, the choice of the projections so as to minimize the volume of the residual memory. We prove tight bounds on the number of linearly independent projection vectors. Our method is constructive, yielding an optimal memory allocation. We extend the method to modular functions, and deal with the subsequent problems of code generation. Our ideas are illustrated on a number of examples generated by the current version of the Alpha compiler.},
journal = {ACM Trans. Program. Lang. Syst.},
month = sep,
pages = {773--815},
numpages = {43},
keywords = {affine recurrence equations, data-parallel languages, dependence analysis, parallel code generation, automatic parallelization, dataflow analysis, memory management, scheduling, polyhedral model, applicative (functional) languages, lifetime analysis}
}
@inproceedings{llvm-vec-cost,
title = {Correlating Cost with Performance in {LLVM}},
author = {Pohl, Angela and Cosenza, Biagio and Juurlink, Ben},
booktitle = {Proceedings of the 13th International Summer School on Advanced Computer Architecture and Compilation for High-Performance and Embedded Systems (ACACES)},
year = {2017}
}
@article{llvm-vec-cost-avx,
title = {Vectorization Cost Modeling for {NEON}, {AVX} and {SVE}},
author = {Pohl, Angela and Cosenza, Biagio and Juurlink, Ben},
journal = {Performance Evaluation},
volume = {140},
pages = {102106},
year = {2020},
publisher = {Elsevier}
}
@inproceedings{polly,
title = {Polly -- Polyhedral Optimization in {LLVM}},
author = {Grosser, Tobias and Zheng, Hongbin and Aloor, Raghesh and Simb{\"u}rger, Andreas and Gr{\"o}{\ss}linger, Armin and Pouchet, Louis-No{\"e}l},
booktitle = {Proceedings of the First International Workshop on Polyhedral Compilation Techniques (IMPACT)},
volume = {2011},
pages = {1},
year = {2011}
}
@article{auto-vec-eval,
title = {Evaluating Auto-Vectorizing Compilers through Objective Withdrawal of Useful Information},
author = {Siso, Sergi and Armour, Wes and Thiyagalingam, Jeyarajan},
journal = {ACM Transactions on Architecture and Code Optimization (TACO)},
volume = {16},
number = {4},
pages = {1--23},
year = {2019},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA}
}
@article{java-pgo,
title = {Online Feedback-Directed Optimization of {Java}},
author = {Arnold, Matthew and Hind, Michael and Ryder, Barbara G.},
journal = {ACM SIGPLAN Notices},
volume = {37},
number = {11},
pages = {111--129},
year = {2002},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA}
}
@article{10.1145/358438.349320,
author = {Larsen, Samuel and Amarasinghe, Saman},
title = {Exploiting Superword Level Parallelism with Multimedia Instruction Sets},
year = {2000},
issue_date = {May 2000},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {35},
number = {5},
issn = {0362-1340},
url = {https://doi.org/10.1145/358438.349320},
doi = {10.1145/358438.349320},
abstract = {Increasing focus on multimedia applications has prompted the addition of multimedia extensions to most existing general purpose microprocessors. This added functionality comes primarily with the addition of short SIMD instructions. Unfortunately, access to these instructions is limited to in-line assembly and library calls. Generally, it has been assumed that vector compilers provide the most promising means of exploiting multimedia instructions. Although vectorization technology is well understood, it is inherently complex and fragile. In addition, it is incapable of locating SIMD-style parallelism within a basic block. In this paper we introduce the concept of Superword Level Parallelism (SLP), a novel way of viewing parallelism in multimedia and scientific applications. We believe SLP is fundamentally different from the loop level parallelism exploited by traditional vector processing, and therefore demands a new method of extracting it. We have developed a simple and robust compiler for detecting SLP that targets basic blocks rather than loop nests. As with techniques designed to extract ILP, ours is able to exploit parallelism both across loop iterations and within basic blocks. The result is an algorithm that provides excellent performance in several application domains. In our experiments, dynamic instruction counts were reduced by 46\%. Speedups ranged from 1.24 to 6.70.},
journal = {SIGPLAN Not.},
month = may,
pages = {145--156},
numpages = {12},
internal-note = {Same paper as entry slp-vectorization (PLDI '00 proceedings version); this is the SIGPLAN Notices version -- cite only one.}
}
@inproceedings{slp-vectorization,
author = {Larsen, Samuel and Amarasinghe, Saman},
title = {Exploiting Superword Level Parallelism with Multimedia Instruction Sets},
year = {2000},
isbn = {1581131992},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/349299.349320},
doi = {10.1145/349299.349320},
abstract = {Increasing focus on multimedia applications has prompted the addition of multimedia extensions to most existing general purpose microprocessors. This added functionality comes primarily with the addition of short SIMD instructions. Unfortunately, access to these instructions is limited to in-line assembly and library calls. Generally, it has been assumed that vector compilers provide the most promising means of exploiting multimedia instructions. Although vectorization technology is well understood, it is inherently complex and fragile. In addition, it is incapable of locating SIMD-style parallelism within a basic block. In this paper we introduce the concept of Superword Level Parallelism (SLP), a novel way of viewing parallelism in multimedia and scientific applications. We believe SLP is fundamentally different from the loop level parallelism exploited by traditional vector processing, and therefore demands a new method of extracting it. We have developed a simple and robust compiler for detecting SLP that targets basic blocks rather than loop nests. As with techniques designed to extract ILP, ours is able to exploit parallelism both across loop iterations and within basic blocks. The result is an algorithm that provides excellent performance in several application domains. In our experiments, dynamic instruction counts were reduced by 46\%. Speedups ranged from 1.24 to 6.70.},
booktitle = {Proceedings of the ACM SIGPLAN 2000 Conference on Programming Language Design and Implementation},
pages = {145--156},
numpages = {12},
location = {Vancouver, British Columbia, Canada},
series = {PLDI '00}
}
@article{ml-vectorization,
author = {Stock, Kevin and Pouchet, Louis-No\"{e}l and Sadayappan, P.},
title = {Using Machine Learning to Improve Automatic Vectorization},
year = {2012},
issue_date = {January 2012},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
volume = {8},
number = {4},
issn = {1544-3566},
url = {https://doi.org/10.1145/2086696.2086729},
doi = {10.1145/2086696.2086729},
abstract = {Automatic vectorization is critical to enhancing performance of compute-intensive programs on modern processors. However, there is much room for improvement over the auto-vectorization capabilities of current production compilers through careful vector-code synthesis that utilizes a variety of loop transformations (e.g., unroll-and-jam, interchange, etc.).As the set of transformations considered is increased, the selection of the most effective combination of transformations becomes a significant challenge: Currently used cost models in vectorizing compilers are often unable to identify the best choices. In this paper, we address this problem using machine learning models to predict the performance of SIMD codes. In contrast to existing approaches that have used high-level features of the program, we develop machine learning models based on features extracted from the generated assembly code. The models are trained offline on a number of benchmarks and used at compile-time to discriminate between numerous possible vectorized variants generated from the input code.We demonstrate the effectiveness of the machine learning model by using it to guide automatic vectorization on a variety of tensor contraction kernels, with improvements ranging from 2\texttimes{} to 8\texttimes{} over Intel ICC's auto-vectorized code. We also evaluate the effectiveness of the model on a number of stencil computations and show good improvement over auto-vectorized code.},
journal = {ACM Trans. Archit. Code Optim.},
month = jan,
articleno = {50},
numpages = {23}
}