Skip to content

Commit 29b5ff9

Browse files
committed
Merge branch 'joss-paper'
2 parents 5007b63 + 9337b41 commit 29b5ff9

File tree

2 files changed

+192
-0
lines changed

2 files changed

+192
-0
lines changed

docs/paper.bib

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
% Reference for the Python language itself. The author is an organization,
% so it carries an extra brace pair to be parsed as one indivisible name
% rather than as given names "Python Core" plus surname "Developers".
@manual{python,
  author       = {{Python Core Developers}},
  title        = {{Python} Programming Language},
  organization = {Python Software Foundation},
  year         = {1995},
  url          = {https://www.python.org/},
}
8+
9+
% Conference paper cited for angr. Only the acronym and the embedded work
% title are brace-protected, so the bibliography style can still apply its
% own casing to the rest of the title.
@inproceedings{angr-shoshitaishvili2016state,
  author    = {Shoshitaishvili, Yan and Wang, Ruoyu and Salls, Christopher and
               Stephens, Nick and Polino, Mario and Dutcher, Audrey and
               Grosen, John and Feng, Siji and Hauser, Christophe and
               Kruegel, Christopher and Vigna, Giovanni},
  title     = {{SoK}: (State of) {The Art of War}: Offensive Techniques in Binary Analysis},
  booktitle = {{IEEE} Symposium on Security and Privacy},
  year      = {2016},
}
15+
16+
% Astropy 2013 paper (ADS export, cleaned up):
%  - the author exported as "Unther, H. M." is restored to "G{\"u}nther",
%    the spelling used for the same person in the astropy:2018 entry;
%  - the AASTeX-only journal macro is replaced by the full journal name so
%    the entry renders without the AAS macro definitions;
%  - only proper nouns in the title are brace-protected.
@article{astropy:2013,
  author        = {{Astropy Collaboration} and {Robitaille}, T.~P. and {Tollerud}, E.~J. and
                   {Greenfield}, P. and {Droettboom}, M. and {Bray}, E. and {Aldcroft}, T. and
                   {Davis}, M. and {Ginsburg}, A. and {Price-Whelan}, A.~M. and
                   {Kerzendorf}, W.~E. and {Conley}, A. and {Crighton}, N. and {Barbary}, K. and
                   {Muna}, D. and {Ferguson}, H. and {Grollier}, F. and {Parikh}, M.~M. and
                   {Nair}, P.~H. and {G{\"u}nther}, H.~M. and {Deil}, C. and {Woillez}, J. and
                   {Conseil}, S. and {Kramer}, R. and {Turner}, J.~E.~H. and {Singer}, L. and
                   {Fox}, R. and {Weaver}, B.~A. and {Zabalza}, V. and {Edwards}, Z.~I. and
                   {Azalee Bostroem}, K. and {Burke}, D.~J. and {Casey}, A.~R. and
                   {Crawford}, S.~M. and {Dencheva}, N. and {Ely}, J. and {Jenness}, T. and
                   {Labrie}, K. and {Lim}, P.~L. and {Pierfederici}, F. and {Pontzen}, A. and
                   {Ptak}, A. and {Refsdal}, B. and {Servillat}, M. and {Streicher}, O.},
  title         = {{Astropy}: A community {Python} package for astronomy},
  journal       = {Astronomy \& Astrophysics},
  volume        = {558},
  pages         = {A33},
  eid           = {A33},
  month         = oct,
  year          = {2013},
  doi           = {10.1051/0004-6361/201322068},
  eprint        = {1307.6212},
  archiveprefix = {arXiv},
  primaryclass  = {astro-ph.IM},
  keywords      = {methods: data analysis, methods: miscellaneous, virtual observatory tools},
  adsurl        = {http://adsabs.harvard.edu/abs/2013A%26A...558A..33A},
  adsnote       = {Provided by the SAO/NASA Astrophysics Data System},
  bdsk-url-1    = {https://dx.doi.org/10.1051/0004-6361/201322068},
}
34+
35+
% Astropy 2018 paper (ADS export, cleaned up):
%  - the three collective labels were exported as malformed "Last, First"
%    names with an unbalanced parenthesis; they are now single braced
%    corporate names kept at their original positions in the list;
%  - the AASTeX-only journal macro is replaced by the full journal name;
%  - only the project name in the title is brace-protected.
% NOTE(review): confirm against the publisher record whether the collective
% labels should remain in the author list at all.
@article{astropy:2018,
  author       = {{Price-Whelan}, A.~M. and {Sip{\H{o}}cz}, B.~M. and {G{\"u}nther}, H.~M. and
                  {Lim}, P.~L. and {Crawford}, S.~M. and {Conseil}, S. and {Shupe}, D.~L. and
                  {Craig}, M.~W. and {Dencheva}, N. and {Ginsburg}, A. and {VanderPlas}, J.~T. and
                  {Bradley}, L.~D. and {P{\'e}rez-Su{\'a}rez}, D. and {de Val-Borro}, M. and
                  {Primary Paper Contributors} and
                  {Aldcroft}, T.~L. and {Cruz}, K.~L. and {Robitaille}, T.~P. and
                  {Tollerud}, E.~J. and
                  {Astropy Coordination Committee} and
                  {Ardelean}, C. and {Babej}, T. and {Bach}, Y.~P. and {Bachetti}, M. and
                  {Bakanov}, A.~V. and {Bamford}, S.~P. and {Barentsen}, G. and {Barmby}, P. and
                  {Baumbach}, A. and {Berry}, K.~L. and {Biscani}, F. and {Boquien}, M. and
                  {Bostroem}, K.~A. and {Bouma}, L.~G. and {Brammer}, G.~B. and {Bray}, E.~M. and
                  {Breytenbach}, H. and {Buddelmeijer}, H. and {Burke}, D.~J. and
                  {Calderone}, G. and {Cano Rodr{\'\i}guez}, J.~L. and {Cara}, M. and
                  {Cardoso}, J.~V.~M. and {Cheedella}, S. and {Copin}, Y. and {Corrales}, L. and
                  {Crichton}, D. and {D{\textquoteright}Avella}, D. and {Deil}, C. and
                  {Depagne}, {\'E}. and {Dietrich}, J.~P. and {Donath}, A. and
                  {Droettboom}, M. and {Earl}, N. and {Erben}, T. and {Fabbro}, S. and
                  {Ferreira}, L.~A. and {Finethy}, T. and {Fox}, R.~T. and {Garrison}, L.~H. and
                  {Gibbons}, S.~L.~J. and {Goldstein}, D.~A. and {Gommers}, R. and
                  {Greco}, J.~P. and {Greenfield}, P. and {Groener}, A.~M. and {Grollier}, F. and
                  {Hagen}, A. and {Hirst}, P. and {Homeier}, D. and {Horton}, A.~J. and
                  {Hosseinzadeh}, G. and {Hu}, L. and {Hunkeler}, J.~S. and
                  {Ivezi{\'c}}, {\v{Z}}. and {Jain}, A. and {Jenness}, T. and {Kanarek}, G. and
                  {Kendrew}, S. and {Kern}, N.~S. and {Kerzendorf}, W.~E. and {Khvalko}, A. and
                  {King}, J. and {Kirkby}, D. and {Kulkarni}, A.~M. and {Kumar}, A. and
                  {Lee}, A. and {Lenz}, D. and {Littlefair}, S.~P. and {Ma}, Z. and
                  {Macleod}, D.~M. and {Mastropietro}, M. and {McCully}, C. and
                  {Montagnac}, S. and {Morris}, B.~M. and {Mueller}, M. and {Mumford}, S.~J. and
                  {Muna}, D. and {Murphy}, N.~A. and {Nelson}, S. and {Nguyen}, G.~H. and
                  {Ninan}, J.~P. and {N{\"o}the}, M. and {Ogaz}, S. and {Oh}, S. and
                  {Parejko}, J.~K. and {Parley}, N. and {Pascual}, S. and {Patil}, R. and
                  {Patil}, A.~A. and {Plunkett}, A.~L. and {Prochaska}, J.~X. and
                  {Rastogi}, T. and {Reddy Janga}, V. and {Sabater}, J. and {Sakurikar}, P. and
                  {Seifert}, M. and {Sherbert}, L.~E. and {Sherwood-Taylor}, H. and
                  {Shih}, A.~Y. and {Sick}, J. and {Silbiger}, M.~T. and {Singanamalla}, S. and
                  {Singer}, L.~P. and {Sladen}, P.~H. and {Sooley}, K.~A. and
                  {Sornarajah}, S. and {Streicher}, O. and {Teuben}, P. and {Thomas}, S.~W. and
                  {Tremblay}, G.~R. and {Turner}, J.~E.~H. and {Terr{\'o}n}, V. and
                  {van Kerkwijk}, M.~H. and {de la Vega}, A. and {Watkins}, L.~L. and
                  {Weaver}, B.~A. and {Whitmore}, J.~B. and {Woillez}, J. and {Zabalza}, V. and
                  {Astropy Contributors}},
  title        = {The {Astropy Project}: Building an Open-science Project and Status of the v2.0 Core Package},
  journal      = {The Astronomical Journal},
  volume       = {156},
  pages        = {123},
  eid          = {123},
  month        = sep,
  year         = {2018},
  doi          = {10.3847/1538-3881/aabc4f},
  primaryclass = {astro-ph.IM},
  keywords     = {methods: data analysis, methods: miscellaneous, methods: statistical, reference systems, Astrophysics - Instrumentation and Methods for Astrophysics},
  adsurl       = {https://ui.adsabs.harvard.edu/#abs/2018AJ....156..123T},
  adsnote      = {Provided by the SAO/NASA Astrophysics Data System},
  bdsk-url-1   = {https://doi.org/10.3847/1538-3881/aabc4f},
}
51+
52+
% Web reference for the blist package. The address lives in the `url` field
% (the same convention the `python` entry uses) instead of a hand-wrapped
% \url inside `howpublished`; the access date stays in `note`.
@misc{blist,
  author = {Stutzbach, Daniel},
  title  = {{blist}: an asymptotically faster list-like type for {Python}},
  year   = {2014},
  url    = {http://stutzbachenterprises.com/blist/},
  note   = {[Online; accessed 5-March-2019]},
}
59+
60+
% Web reference for the bintrees package. The address lives in the `url`
% field (the same convention the `python` entry uses) instead of a
% hand-wrapped \url inside `howpublished`; the access date stays in `note`.
@misc{bintrees,
  author = {Moitzi, Manfred},
  title  = {{bintrees}: Binary Tree Package},
  year   = {2017},
  url    = {https://github.com/mozman/bintrees},
  note   = {[Online; accessed 5-March-2019]},
}
67+
68+
% Web reference for Dan Stromberg's tree data-structure pages. The address
% lives in the `url` field (the same convention the `python` entry uses)
% instead of a hand-wrapped \url inside `howpublished`.
@misc{dan-stromberg,
  author = {Stromberg, Dan},
  title  = {Dictionary-like Trees},
  year   = {2019},
  url    = {http://stromberg.dnsalias.org/~dstromberg/datastructures/},
  note   = {[Online; accessed 5-March-2019]},
}
75+
76+
% Conference paper cited for Dask. The page range uses the BibTeX
% double-hyphen form, and editors are written in "Last, First" form so name
% parsing is unambiguous.
@inproceedings{dask-matthew_rocklin-proc-scipy-2015,
  author    = {Rocklin, Matthew},
  title     = {{Dask}: Parallel Computation with Blocked algorithms and Task Scheduling},
  booktitle = {Proceedings of the 14th {Python} in Science Conference},
  editor    = {Huff, Kathryn and Bergstra, James},
  pages     = {130--136},
  year      = {2015},
}
84+
85+
% Web reference for the SortedCollection recipe. The address lives in the
% `url` field (the same convention the `python` entry uses) instead of a
% hand-wrapped \url inside `howpublished`; the access date stays in `note`.
@misc{sortedcollection,
  author = {Hettinger, Raymond},
  title  = {{SortedCollection} ({Python} Recipe)},
  year   = {2010},
  url    = {http://code.activestate.com/recipes/577197-sortedcollection/},
  note   = {[Online; accessed 5-March-2019]},
}
92+
93+
% Web reference for the Trio documentation. The address lives in the `url`
% field (the same convention the `python` entry uses) instead of a
% hand-wrapped \url inside `howpublished`; the access date stays in `note`.
@misc{trio,
  author = {Smith, Nathaniel J.},
  title  = {{Trio}: async programming for humans and snake people},
  year   = {2017},
  url    = {https://trio.readthedocs.io/},
  note   = {[Online; accessed 5-March-2019]},
}
100+
101+
% Web reference for Zipline. "and others" is the BibTeX token for a
% truncated author list. The address lives in the `url` field (the same
% convention the `python` entry uses) instead of a hand-wrapped \url inside
% `howpublished`.
@misc{zipline,
  author = {Hebert, Eddie and Sanderson, Scott and Jevnik, Joe and
            Frank, Richard and Wiecki, Thomas and others},
  title  = {{Zipline}, a {Pythonic} Algorithmic Trading Library},
  year   = {2016},
  url    = {http://www.zipline.io/},
  note   = {[Online; accessed 5-March-2019]},
}

docs/paper.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
---
2+
title: 'Python Sorted Containers'
3+
tags:
4+
- Python
5+
- sorted
6+
- list
7+
- dictionary
8+
- set
9+
authors:
10+
- name: Grant Jenks
11+
orcid: 0000-0001-5010-405X
12+
date: 5 March 2019
13+
bibliography: paper.bib
14+
---
15+
16+
# Summary
17+
18+
The standard libraries of popular languages like C++, Java, and C# provide sorted
19+
container data types based on binary tree data structures. While Python has
20+
risen in popularity, the Standard Library still lacks these common data types.
21+
Part of the challenge has been Python's rich object model which makes binary
22+
trees implemented in Python slow in terms of both memory and processor
23+
usage. To overcome the overhead of the interpreter, C-extensions are used by
24+
the Python core developers. In doing so, flexibility is traded off for
25+
performance. The goal of the Python core developers is to provide the right set
26+
of high-level APIs so that algorithms and data structures can be implemented
27+
efficiently. The Python Sorted Containers library uses Python's high-level APIs
28+
to efficiently implement sorted container data structures.
29+
30+
Python's collections data structures are based on three data types: sequences,
31+
mappings, and sets. These data types are implemented and most commonly used as
32+
list, dictionary, and set objects. The Python Sorted Containers library
33+
introduces new variants of these three data types that are each sorted: sorted
34+
list, sorted dictionary, and sorted set. In each case, the original semantics
35+
are extended to preserve sorted order of the contained elements with respect to
36+
mutating operations. When unable to preserve the sorted order constraint, the
37+
functionality is either non-existent or an error is raised from the
38+
library. Python's "sorted" built-in function also supports a "key" parameter
39+
which specifies a callable used to extract a comparison key from elements. When
40+
initializing a sorted container data type, the key parameter is likewise
41+
supported.
42+
43+
Internally, Python Sorted Containers uses a list of sublists data structure
44+
that is like a B-tree constrained to two levels of nodes. The maximum of each
45+
sublist is maintained in a separate list. To look up an element, the list of
46+
maximums is bisected using the "bisect" module in the Standard Library. Using
47+
the bisected maximums index, the corresponding sublist is bisected to find the
48+
index of the desired element. To index the k'th element, a separate positional
49+
index, known as the "Jenks" index, is built and maintained. The positional
50+
index is like the order statistic of a binary tree densely packed into a
51+
list. By maintaining the size of the sublists as proportional to the
52+
$\sqrt[3]{n}$, the amortized time complexity of all operations is
53+
$O(\sqrt[3]{n})$. This bound works well for up to billions of elements which
54+
often exhausts all available memory.
55+
56+
Python Sorted Containers overlaps and extends the "bisect" and "heapq" modules
57+
provided in the Standard Library. In contrast to the function-oriented
58+
interface provided by these modules, Sorted Containers provides an
59+
object-oriented interface. Externally, SQLite in-memory indexes, Pandas
60+
DataFrame indexes, and Redis sorted sets provide similar functionality. These
61+
data structures are applied in priority queue, multiset, nearest neighbors,
62+
intervals, and ranking algorithms. Sorted Containers is used by scientific open
63+
source projects such as: Angr, a binary analysis platform from UC Santa
64+
Barbara; Astropy, a community Python package for astronomy; Dask Distributed, a
65+
library for dynamic task scheduling by Anaconda; Trio, an asynchronous I/O
66+
library; and Zipline, an algorithmic trading library by Quantopian.
67+
68+
# Acknowledgements
69+
70+
Thank you to Daniel Stutzbach for the "blist" software project to which Sorted
71+
Containers owes much of the original interface design.
72+
73+
Thank you to Raymond Hettinger for the "SortedCollection" recipe which
74+
originally inspired the support and design of the "key" parameter feature.
75+
76+
Thank you to Manfred Moitzi for the "bintrees" software project which motivated
77+
the range-based tree traversal interfaces.
78+
79+
Thank you to Dan Stromberg for the benchmark comparisons of less common binary
80+
tree data structures like treap, splay, and scapegoat.
81+
82+
Thank you to the open source community that has contributed bug reports,
83+
documentation improvements, and feature guidance in development of the project.
84+
85+
# References

0 commit comments

Comments
 (0)