Skip to content
This repository was archived by the owner on Jan 21, 2026. It is now read-only.

Commit 4d4c8e6

Browse files
committed
Merge pull request #243 from nostrademons/pypi_fixes
Pypi fixes
2 parents 3a61e9a + 2db1796 commit 4d4c8e6

File tree

5 files changed

+243
-33
lines changed

5 files changed

+243
-33
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ test-suite.log
4848
/gumbo-[0-9].[0-9].tar.gz
4949
/gumbo-[0-9].[0-9]/
5050

51+
# Python dist artifacts
52+
*.pyc
53+
dist
54+
build
55+
python/gumbo.egg-info
56+
python/gumbo/libgumbo.so
57+
5158
# Example binaries
5259
clean_text
5360
find_links

Doxyfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ PROJECT_NAME = "Gumbo"
3232
# This could be handy for archiving the generated documentation or
3333
# if some version control system is used.
3434

35-
PROJECT_NUMBER = {{VERSION}}
35+
PROJECT_NUMBER = 0.9.1
3636

3737
# Using the PROJECT_BRIEF tag one can provide an optional one line description
3838
# for a project that appears at the top of each page and should give viewer

python/gumbo/__init__.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,5 +31,15 @@
3131
"""
3232

3333
from gumbo.gumboc import *
34-
from gumbo import html5lib_adapter as html5lib
35-
from gumbo.soup_adapter import parse as soup_parse
34+
35+
try:
36+
from gumbo import html5lib_adapter as html5lib
37+
except ImportError:
38+
# html5lib not installed
39+
pass
40+
41+
try:
42+
from gumbo.soup_adapter import parse as soup_parse
43+
except ImportError:
44+
# BeautifulSoup not installed
45+
pass

python/gumbo/gumboc.py

Lines changed: 58 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -24,9 +24,18 @@
2424

2525
import contextlib
2626
import ctypes
27-
27+
import os.path
2828

2929
try:
30+
# First look for a freshly-built .so in the .libs directory, for development.
31+
_dll = ctypes.cdll.LoadLibrary(os.path.join(
32+
os.path.dirname(__file__), '..', '..', '.libs', 'libgumbo.so'))
33+
except OSError:
34+
# PyPI or setuptools install, look in the current directory.
35+
_dll = ctypes.cdll.LoadLibrary(os.path.join(
36+
os.path.dirname(__file__), 'libgumbo.so'))
37+
except OSError:
38+
# System library, on unix
3039
_dll = ctypes.cdll.LoadLibrary('libgumbo.so')
3140
except OSError:
3241
# MacOS X
@@ -36,22 +45,31 @@
3645
_bitvector = ctypes.c_uint
3746
_Ptr = ctypes.POINTER
3847

39-
40-
class Enum(ctypes.c_uint):
41-
class __metaclass__(type(ctypes.c_uint)):
42-
def __new__(metaclass, name, bases, cls_dict):
43-
cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
44-
if name == 'Enum':
45-
return cls
46-
try:
47-
for i, value in enumerate(cls_dict['_values_']):
48-
setattr(cls, value, cls.from_param(i))
49-
except KeyError:
50-
raise ValueError('No _values_ list found inside enum type.')
51-
except TypeError:
52-
raise ValueError('_values_ must be a list of names of enum constants.')
48+
class EnumMetaclass(type(ctypes.c_uint)):
49+
def __new__(metaclass, name, bases, cls_dict):
50+
cls = type(ctypes.c_uint).__new__(metaclass, name, bases, cls_dict)
51+
if name == 'Enum':
5352
return cls
54-
53+
try:
54+
for i, value in enumerate(cls_dict['_values_']):
55+
setattr(cls, value, cls.from_param(i))
56+
except KeyError:
57+
raise ValueError('No _values_ list found inside enum type.')
58+
except TypeError:
59+
raise ValueError('_values_ must be a list of names of enum constants.')
60+
return cls
61+
62+
def with_metaclass(mcls):
63+
def decorator(cls):
64+
body = vars(cls).copy()
65+
# clean out class body
66+
body.pop('__dict__', None)
67+
body.pop('__weakref__', None)
68+
return mcls(cls.__name__, cls.__bases__, body)
69+
return decorator
70+
71+
@with_metaclass(EnumMetaclass)
72+
class Enum(ctypes.c_uint):
5573
@classmethod
5674
def from_param(cls, param):
5775
if isinstance(param, Enum):
@@ -145,18 +163,30 @@ def __init__(self, vector):
145163
def __iter__(self):
146164
return self
147165

148-
def next(self):
166+
def __next__(self):
167+
# Python 3
149168
if self.current >= self.vector.length:
150169
raise StopIteration
151170
obj = self.vector[self.current]
152171
self.current += 1
153172
return obj
154173

174+
def next(self):
175+
# Python 2
176+
return self.__next__()
177+
155178
def __len__(self):
156179
return self.length
157180

158181
def __getitem__(self, i):
159-
if isinstance(i, (int, long)):
182+
try:
183+
# Python 2
184+
numeric_types = (int, long)
185+
except NameError:
186+
# Python 3
187+
numeric_types = int
188+
189+
if isinstance(i, numeric_types):
160190
if i < 0:
161191
i += self.length
162192
if i > self.length:
@@ -424,20 +454,25 @@ class NodeUnion(ctypes.Union):
424454
class Node(ctypes.Structure):
425455
# _fields_ set later to avoid a circular reference
426456

427-
@property
428-
def contents(self):
457+
def _contents(self):
458+
# Python3 enters an infinite loop if you use an @property within
459+
# __getattr__, so we factor it out to a helper.
429460
if self.type == NodeType.DOCUMENT:
430461
return self.v.document
431462
elif self.type == NodeType.ELEMENT:
432463
return self.v.element
433464
else:
434465
return self.v.text
435466

467+
@property
468+
def contents(self):
469+
return self._contents()
470+
436471
def __getattr__(self, name):
437-
return getattr(self.contents, name)
472+
return getattr(self._contents(), name)
438473

439474
def __setattr__(self, name, value):
440-
return setattr(self.contents, name, value)
475+
return setattr(self._contents(), name, value)
441476

442477
def __repr__(self):
443478
return repr(self.contents)
@@ -492,7 +527,7 @@ def parse(text, **kwargs):
492527
# outlives the parse output. If we let ctypes do it automatically on function
493528
# call, it creates a temporary buffer which is destroyed when the call
494529
# completes, and then the original_text pointers point into invalid memory.
495-
text_ptr = ctypes.c_char_p(text)
530+
text_ptr = ctypes.c_char_p(text.encode('utf-8'))
496531
output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
497532
try:
498533
yield output

setup.py

Lines changed: 165 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,177 @@
11
#!/usr/bin/env python
22
from setuptools import setup
3+
from setuptools.command.sdist import sdist
34

4-
def readme():
5-
with open('README.md') as f:
6-
return f.read()
5+
class CustomSdistCommand(sdist):
6+
"""Customized Sdist command, to copy libgumbo.so into the Python directory
7+
so that it can be installed with `pip install`."""
8+
def run(self):
9+
try:
10+
import shutil
11+
shutil.copyfile('.libs/libgumbo.so', 'python/gumbo/libgumbo.so')
12+
sdist.run(self)
13+
except IOError as e:
14+
print(e)
15+
16+
17+
README = '''Gumbo - A pure-C HTML5 parser.
18+
==============================
19+
20+
Gumbo is an implementation of the `HTML5 parsing algorithm <http://www.whatwg.org/specs/web-apps/current-work/multipage/#auto-toc-12>`_ implemented
21+
as a pure C99 library with no outside dependencies. It's designed to serve
22+
as a building block for other tools and libraries such as linters,
23+
validators, templating languages, and refactoring and analysis tools. This
24+
package contains the library itself, Python ctypes bindings for the library, and
25+
adapters for html5lib and BeautifulSoup (3.2) that give it the same API as those
26+
libraries.
27+
28+
Goals & features:
29+
-----------------
30+
31+
- Robust and resilient to bad input.
32+
33+
- Simple API that can be easily wrapped by other languages.
34+
35+
- Support for source locations and pointers back to the original text.
36+
37+
- Relatively lightweight, with no outside dependencies.
38+
39+
- Passes all `html5lib-0.95 tests <https://github.com/html5lib/html5lib-tests>`_.
40+
41+
- Tested on over 2.5 billion pages from Google's index.
42+
43+
Non-goals:
44+
----------
45+
46+
- Execution speed. Gumbo gains some of this by virtue of being written in
47+
C, but it is not an important consideration for the intended use-case, and
48+
was not a major design factor.
49+
50+
- Support for encodings other than UTF-8. For the most part, client code
51+
can convert the input stream to UTF-8 text using another library before
52+
processing.
53+
54+
- Security. Gumbo was initially designed for a product that worked with
55+
trusted input files only. We're working to harden this and make sure that it
56+
behaves as expected even on malicious input, but for now, Gumbo should only be
57+
run on trusted input or within a sandbox.
58+
59+
- C89 support. Most major compilers support C99 by now; the major exception
60+
(Microsoft Visual Studio) should be able to compile this in C++ mode with
61+
relatively few changes. (Bug reports welcome.)
62+
63+
Wishlist (aka "We couldn't get these into the original release, but are
64+
hoping to add them soon"):
65+
66+
- Support for recent HTML5 spec changes to support the template tag.
67+
68+
- Support for fragment parsing.
69+
70+
- Full-featured error reporting.
71+
72+
- Bindings in other languages.
73+
74+
Installation
75+
------------
76+
77+
``pip install gumbo`` should do it. If you have a local copy, run
78+
``python setup.py install`` from the root directory.
79+
80+
The `html5lib <https://pypi.python.org/pypi/html5lib/0.999>`_ and
81+
`BeautifulSoup <https://pypi.python.org/pypi/BeautifulSoup/3.2.1>`_ adapters
82+
require that their respective libraries be installed separately to work.
83+
84+
Basic Usage
85+
-----------
86+
87+
For the ctypes bindings:
88+
89+
.. code-block:: python
90+
91+
import gumbo
92+
93+
with gumbo.parse(text) as output:
94+
root = output.contents.root.contents
95+
# root is a Node object representing the root of the parse tree
96+
# tree-walk over it as necessary.
97+
98+
For the BeautifulSoup bindings:
99+
100+
.. code-block:: python
101+
102+
import gumbo
103+
104+
soup = gumbo.soup_parse(text)
105+
# soup is a BeautifulSoup object representing the parse tree.
106+
107+
For the html5lib bindings:
108+
109+
.. code-block:: python
110+
111+
from gumbo import html5lib
112+
113+
doc = html5lib.parse(text[, treebuilder='lxml'])
114+
115+
Recommended best-practice for Python usage is to use one of the adapters to
116+
an existing API (personally, I prefer BeautifulSoup) and write your program
117+
in terms of those. The raw CTypes bindings should be considered building
118+
blocks for higher-level libraries and rarely referenced directly.
119+
120+
See the source code, Pydoc, and implementation of soup_adapter and
121+
html5lib_adapter for more information.
122+
123+
A note on API/ABI compatibility
124+
-------------------------------
125+
126+
We'll make a best effort to preserve API compatibility between releases.
127+
The initial release is a 0.9 (beta) release to solicit comments from early
128+
adopters, but if no major problems are found with the API, a 1.0 release
129+
will follow shortly, and the API of that should be considered stable. If
130+
changes are necessary, we follow `semantic versioning <http://semver.org/>`_.
131+
132+
We make no such guarantees about the ABI, and it's very likely that
133+
subsequent versions may require a recompile of client code. For this
134+
reason, we recommend NOT using Gumbo data structures throughout a program,
135+
and instead limiting them to a translation layer that picks out whatever
136+
data is needed from the parse tree and then converts that to persistent
137+
data structures more appropriate for the application. The API is
138+
structured to encourage this use, with a single delete function for the
139+
whole parse tree, and is not designed with mutation in mind.
140+
141+
Most of this is transparent to Python usage, as the Python adapters are all
142+
built with this in mind. However, since ctypes requires ABI compatibility, it
143+
does mean you'll have to re-deploy the gumboc library and C extension when
144+
upgrading to a new version.
145+
'''
146+
147+
CLASSIFIERS = [
148+
'Development Status :: 4 - Beta',
149+
'Intended Audience :: Developers',
150+
'License :: OSI Approved :: Apache Software License',
151+
'Operating System :: Unix',
152+
'Operating System :: POSIX :: Linux',
153+
'Programming Language :: C',
154+
'Programming Language :: Python',
155+
'Programming Language :: Python :: 2',
156+
'Programming Language :: Python :: 2.7',
157+
'Programming Language :: Python :: 3',
158+
'Programming Language :: Python :: 3.4',
159+
'Topic :: Software Development :: Libraries :: Python Modules',
160+
'Topic :: Text Processing :: Markup :: HTML'
161+
]
7162

8163
setup(name='gumbo',
9-
version='{{VERSION}}',
164+
version='0.9.1',
10165
description='Python bindings for Gumbo HTML parser',
11-
long_description=readme(),
166+
long_description=README,
12167
url='http://github.com/google/gumbo-parser',
13168
keywords='gumbo html html5 parser google html5lib beautifulsoup',
14169
author='Jonathan Tang',
15-
author_email='jdtang@google.com',
170+
author_email='jonathan.d.tang@gmail.com',
16171
license='Apache 2.0',
172+
classifiers=CLASSIFIERS,
17173
packages=['gumbo'],
18174
package_dir={'': 'python'},
19-
zip_safe=True)
175+
package_data={'gumbo': ['libgumbo.so']},
176+
cmdclass={ 'sdist': CustomSdistCommand },
177+
zip_safe=False)

0 commit comments

Comments
 (0)