|
1 | 1 | #!/usr/bin/env python |
2 | 2 | from setuptools import setup |
| 3 | +from setuptools.command.sdist import sdist |
3 | 4 |
|
4 | | -def readme(): |
5 | | - with open('README.md') as f: |
6 | | - return f.read() |
class CustomSdistCommand(sdist):
    """Customized sdist command that bundles libgumbo.so with the Python package.

    Before delegating to the stock ``sdist`` command, this copies
    ``.libs/libgumbo.so`` to ``python/gumbo/libgumbo.so`` so that the shared
    library ships inside the source distribution and is installed by
    ``pip install``.
    """

    def run(self):
        """Copy the shared library into the package, then build the sdist.

        Raises:
            SystemExit: if the shared library cannot be copied. The build is
                aborted with a non-zero exit status instead of printing the
                error and reporting success without producing an sdist.
        """
        import shutil

        # Only the copy is guarded: errors raised by sdist.run() itself must
        # propagate so a failed build is never reported as a success.
        try:
            # NOTE(review): '.libs' is presumably the libtool output directory
            # of the C build -- confirm against the project's build docs.
            shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so')
        except IOError as e:
            raise SystemExit(
                'error: could not copy libgumbo.so into python/gumbo/: %s' % e)
        sdist.run(self)
| 15 | + |
| 16 | + |
| 17 | +README = '''Gumbo - A pure-C HTML5 parser. |
| 18 | +============================== |
| 19 | +
|
| 20 | +Gumbo is an implementation of the `HTML5 parsing algorithm <http://www.whatwg.org/specs/web-apps/current-work/multipage/#auto-toc-12>`_ implemented |
| 21 | +as a pure C99 library with no outside dependencies. It's designed to serve |
| 22 | +as a building block for other tools and libraries such as linters, |
| 23 | +validators, templating languages, and refactoring and analysis tools. This |
| 24 | +package contains the library itself, Python ctypes bindings for the library, and |
| 25 | +adapters for html5lib and BeautifulSoup (3.2) that give it the same API as those |
| 26 | +libraries. |
| 27 | +
|
| 28 | +Goals & features: |
| 29 | +----------------- |
| 30 | +
|
| 31 | +- Robust and resilient to bad input. |
| 32 | +
|
| 33 | +- Simple API that can be easily wrapped by other languages. |
| 34 | +
|
| 35 | +- Support for source locations and pointers back to the original text. |
| 36 | +
|
| 37 | +- Relatively lightweight, with no outside dependencies. |
| 38 | +
|
| 39 | +- Passes all `html5lib-0.95 tests <https://github.com/html5lib/html5lib-tests>`_. |
| 40 | +
|
| 41 | +- Tested on over 2.5 billion pages from Google's index. |
| 42 | +
|
| 43 | +Non-goals: |
| 44 | +---------- |
| 45 | +
|
| 46 | +- Execution speed. Gumbo gains some of this by virtue of being written in |
| 47 | + C, but it is not an important consideration for the intended use-case, and |
| 48 | + was not a major design factor. |
| 49 | +
|
| 50 | +- Support for encodings other than UTF-8. For the most part, client code |
| 51 | + can convert the input stream to UTF-8 text using another library before |
| 52 | + processing. |
| 53 | +
|
| 54 | +- Security. Gumbo was initially designed for a product that worked with |
| 55 | + trusted input files only. We're working to harden this and make sure that it |
| 56 | + behaves as expected even on malicious input, but for now, Gumbo should only be |
| 57 | + run on trusted input or within a sandbox. |
| 58 | +
|
| 59 | +- C89 support. Most major compilers support C99 by now; the major exception |
| 60 | + (Microsoft Visual Studio) should be able to compile this in C++ mode with |
| 61 | + relatively few changes. (Bug reports welcome.) |
| 62 | +
|
| 63 | +Wishlist (aka "We couldn't get these into the original release, but are |
| 64 | +hoping to add them soon"): |
| 65 | +
|
| 66 | +- Support for recent HTML5 spec changes to support the template tag. |
| 67 | +
|
| 68 | +- Support for fragment parsing. |
| 69 | +
|
| 70 | +- Full-featured error reporting. |
| 71 | +
|
| 72 | +- Bindings in other languages. |
| 73 | +
|
| 74 | +Installation |
| 75 | +------------ |
| 76 | +
|
| 77 | +``pip install gumbo`` should do it. If you have a local copy, run ``python |
| 78 | +setup.py install`` from the root directory. |
| 79 | +
|
| 80 | +The `html5lib <https://pypi.python.org/pypi/html5lib/0.999>`_ and |
| 81 | +`BeautifulSoup <https://pypi.python.org/pypi/BeautifulSoup/3.2.1>`_ adapters |
| 82 | +require that their respective libraries be installed separately to work. |
| 83 | +
|
| 84 | +Basic Usage |
| 85 | +----------- |
| 86 | +
|
| 87 | +For the ctypes bindings: |
| 88 | +
|
| 89 | +.. code-block:: python |
| 90 | +
|
| 91 | + import gumbo |
| 92 | + |
| 93 | + with gumbo.parse(text) as output: |
| 94 | + root = output.contents.root.contents |
| 95 | + # root is a Node object representing the root of the parse tree |
| 96 | + # tree-walk over it as necessary. |
| 97 | +
|
| 98 | +For the BeautifulSoup bindings: |
| 99 | +
|
| 100 | +.. code-block:: python |
| 101 | +
|
| 102 | + import gumbo |
| 103 | +
|
| 104 | + soup = gumbo.soup_parse(text) |
| 105 | + # soup is a BeautifulSoup object representing the parse tree. |
| 106 | +
|
| 107 | +For the html5lib bindings: |
| 108 | +
|
| 109 | +.. code-block:: python |
| 110 | +
|
| 111 | + from gumbo import html5lib |
| 112 | +
|
| 113 | + doc = html5lib.parse(text[, treebuilder='lxml']) |
| 114 | +
|
| 115 | +Recommended best-practice for Python usage is to use one of the adapters to |
| 116 | +an existing API (personally, I prefer BeautifulSoup) and write your program |
| 117 | +in terms of those. The raw CTypes bindings should be considered building |
| 118 | +blocks for higher-level libraries and rarely referenced directly. |
| 119 | +
|
| 120 | +See the source code, Pydoc, and implementation of soup_adapter and |
| 121 | +html5lib_adapter for more information. |
| 122 | +
|
| 123 | +A note on API/ABI compatibility |
| 124 | +------------------------------- |
| 125 | +
|
| 126 | +We'll make a best effort to preserve API compatibility between releases. |
| 127 | +The initial release is a 0.9 (beta) release to solicit comments from early |
| 128 | +adopters, but if no major problems are found with the API, a 1.0 release |
| 129 | +will follow shortly, and the API of that should be considered stable. If |
| 130 | +changes are necessary, we follow `semantic versioning <http://semver.org/>`_. |
| 131 | +
|
| 132 | +We make no such guarantees about the ABI, and it's very likely that |
| 133 | +subsequent versions may require a recompile of client code. For this |
| 134 | +reason, we recommend NOT using Gumbo data structures throughout a program, |
| 135 | +and instead limiting them to a translation layer that picks out whatever |
| 136 | +data is needed from the parse tree and then converts that to persistent |
| 137 | +data structures more appropriate for the application. The API is |
| 138 | +structured to encourage this use, with a single delete function for the |
| 139 | +whole parse tree, and is not designed with mutation in mind. |
| 140 | +
|
| 141 | +Most of this is transparent to Python usage, as the Python adapters are all |
| 142 | +built with this in mind. However, since ctypes requires ABI compatibility, it |
| 143 | +does mean you'll have to re-deploy the gumboc library and C extension when |
| 144 | +upgrading to a new version. |
| 145 | +''' |
| 146 | + |
| 147 | +CLASSIFIERS = [ |
| 148 | + 'Development Status :: 4 - Beta', |
| 149 | + 'Intended Audience :: Developers', |
| 150 | + 'License :: OSI Approved :: Apache Software License', |
| 151 | + 'Operating System :: Unix', |
| 152 | + 'Operating System :: POSIX :: Linux', |
| 153 | + 'Programming Language :: C', |
| 154 | + 'Programming Language :: Python', |
| 155 | + 'Programming Language :: Python :: 2', |
| 156 | + 'Programming Language :: Python :: 2.7', |
| 157 | + 'Programming Language :: Python :: 3', |
| 158 | + 'Programming Language :: Python :: 3.4', |
| 159 | + 'Topic :: Software Development :: Libraries :: Python Modules', |
| 160 | + 'Topic :: Text Processing :: Markup :: HTML' |
| 161 | +] |
7 | 162 |
|
8 | 163 | setup(name='gumbo', |
9 | | - version='{{VERSION}}', |
| 164 | + version='0.9.1', |
10 | 165 | description='Python bindings for Gumbo HTML parser', |
11 | | - long_description=readme(), |
| 166 | + long_description=README, |
12 | 167 | url='http://github.com/google/gumbo-parser', |
13 | 168 | keywords='gumbo html html5 parser google html5lib beautifulsoup', |
14 | 169 | author='Jonathan Tang', |
15 | | - author_email='jdtang@google.com', |
| 170 | + author_email='jonathan.d.tang@gmail.com', |
16 | 171 | license='Apache 2.0', |
| 172 | + classifiers=CLASSIFIERS, |
17 | 173 | packages=['gumbo'], |
18 | 174 | package_dir={'': 'python'}, |
19 | | - zip_safe=True) |
| 175 | + package_data={'gumbo': ['libgumbo.so']}, |
| 176 | + cmdclass={ 'sdist': CustomSdistCommand }, |
| 177 | + zip_safe=False) |
0 commit comments