Commit 7b94b3b: release v1.0.0, tested & improved
1 parent 1ac8153
File tree: 11 files changed, +190 -54 lines
GitHub Actions release workflow (new file): 37 additions & 0 deletions

```yaml
name: Build and Release Python Project

on:
  push:
    tags:
      - 'v*' # Triggers on version tags

jobs:
  build-and-release:
    runs-on: ubuntu-latest

    steps:
      # Step 1: Checkout the repository
      - name: Checkout Code
        uses: actions/checkout@v4

      # Step 2: Set up Python
      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.9' # Use your desired Python version

      # Step 3: Install build dependencies
      - name: Install build tools
        run: pip install build --upgrade

      # Step 4: Build the project
      - name: Build the Python Project
        run: python -m build

      # Step 5: Create a GitHub Release
      - name: Create GitHub Release
        uses: softprops/action-gh-release@v2
        with:
          # Upload all artifacts in the dist/ folder
          files: |
            dist/*
```
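
The workflow above fires only on tags matching the glob `v*`. GitHub's tag filters use their own glob syntax, but Python's `fnmatch` is a close enough approximation for a quick local sanity check before pushing a tag (a sketch, not GitHub's actual matcher):

```python
from fnmatch import fnmatch

# Tags that would (roughly) trigger the workflow above
assert fnmatch("v1.0.0", "v*")
assert fnmatch("v2.3.1-rc1", "v*")

# Tags that would not
assert not fnmatch("release-1.0.0", "v*")
assert not fnmatch("1.0.0", "v*")
```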

.gitignore: 2 additions & 1 deletion

```diff
@@ -5,4 +5,5 @@ __pycache__
 *.pyd
 .venv
 *.egg-info
-dist
+dist
+other
```

MANIFEST.in: 2 additions & 1 deletion

```diff
@@ -1,3 +1,4 @@
 exclude tests/*
 exclude .pytest_cache
-exclude .venv
+exclude .venv
+exclude other/*
```

README.md: 6 additions & 3 deletions

````diff
@@ -1,14 +1,17 @@
 # Curl Adapter
-A module that plugs straight-in to the python *[requests](https://github.com/psf/requests)* library and replaces the default *urllib3* HTTP adapter with cURL.
+![PyPI - Downloads](https://img.shields.io/pypi/dw/curl-adapter)
+
+A module that plugs straight-in to the python *[requests](https://github.com/psf/requests)* library and replaces the default *urllib3* HTTP adapter with **cURL**, equipped with TLS fingerprint changing capabilities.
 
 ## Why?
 
 Specifically, this module is meant to be used with the "curl impersonate" python bindings ([lexiforest/curl_cffi](https://github.com/lexiforest/curl_cffi)), in order to send HTTP requests with custom, browser-like TLS & HTTP/2 fingerprints for bypassing sites that detect and block normal python requests (such as [Cloudflare](https://www.nstbrowser.io/en/blog/how-does-cloudflare-detect-bots) for example).
+
 <details>
 <summary>Note</summary>
 Even though <i><a href="https://github.com/lexiforest/curl_cffi">curl_cffi</a></i> already has an API that *mimicks* the <i>requests</i> library, it comes with some compatibility issues (e.g. response.raw not available, response.history, differences in headers, cookies, json, etc.).
 <br><br>
-With curl adapter, instead of copying and mimicking the <i>requests</i> library API, just the low level HTTP adapter is changed, and everything else is exactly the same (even the exceptions).
+With curl adapter, instead of copying and mimicking the <i>requests</i> library API, the low level HTTP adapter is changed with a custom crafted one, and everything else is exactly the same (even the exceptions are mapped).
 <br><br>
 With a single switch you can enable/disable curl for your requests, without needing to worry about changing the way you normally work with requests.
 <br><br>
@@ -63,7 +66,7 @@ with requests.Session() as s:
 ```
 
 ## More
-You can get extra information from curl response info:
+You can get extra information from the curl response info:
 ```python
 import requests
 from curl_adapter import PyCurlAdapter, CurlInfo
````

curl_adapter/base_adapter.py: 1 addition & 3 deletions

```diff
@@ -347,7 +347,7 @@ def build_response(self, curl: typing.Union[curl_cffi.Curl, pycurl.Curl], res:Cu
         response.encoding = get_encoding_from_headers(response.headers)
         response.raw = res
 
-        response.reason = parsed_headers["headers"]
+        response.reason = parsed_headers["reason"]
 
         response.get_curl_info = get_curl_info
 
@@ -481,8 +481,6 @@ def set_curl_options(self,
 
         # files
         #already handled
-        # multipart
-        #already handled
 
         # auth
         #already handled, it's just a header...
```
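
The one-line fix above swaps `parsed_headers["headers"]` for `parsed_headers["reason"]`, so `response.reason` now carries the status line's reason phrase rather than the header dict. As a rough illustration of where such a `reason` value comes from (hypothetical helper for illustration; the adapter's real header parser is not shown in this diff):

```python
def parse_status_line(status_line: str) -> dict:
    # e.g. "HTTP/1.1 404 Not Found" -> version, status code, reason phrase
    version, _, rest = status_line.partition(" ")
    code, _, reason = rest.partition(" ")
    # Shape mirrors the parsed_headers dict used in the diff (assumed)
    return {"version": version, "status": int(code), "reason": reason, "headers": {}}

info = parse_status_line("HTTP/1.1 404 Not Found")
assert info["status"] == 404
assert info["reason"] == "Not Found"
```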

curl_adapter/curl_cffi.py: 7 additions & 2 deletions

```diff
@@ -47,6 +47,7 @@ def get_curl_info(self, curl: curl_cffi.Curl, option_code: int):
         """
         Currently, curl_cfii doesn't work for retriving information like TOTAL_TIME_T, SPEED_DOWNLOAD_T,
         because they haven't mapped the all option codes. (These options start at 0x600000 int64_t, but curl_cfii maps only up to 0x400000...)
+        I made a pull request to fix it: https://github.com/lexiforest/curl_cffi/pull/481 (but as of now it's not merged yet)
         """
         c_value = ffi.new("int64_t*")
         value = lib.curl_easy_getinfo(curl._curl, option_code, c_value)
@@ -59,7 +60,9 @@ def get_curl_info(self, curl: curl_cffi.Curl, option_code: int):
 
     def set_ja3_options(self, curl: curl_cffi.Curl, ja3: str, permute: bool = False):
         """
-        Detailed explanation: https://engineering.salesforce.com/tls-fingerprinting-with-ja3-and-ja3s-247362855967/
+        function sourced from: https://github.com/lexiforest/curl_cffi/blob/main/curl_cffi/requests/utils.py
+
+        Detailed explanation: https://engineering.salesforce.com/tls-fingerprinting-with-ja3-and-ja3s-247362855967/
         """
 
         def toggle_extensions_by_ids(curl: curl_cffi.Curl, extension_ids):
@@ -128,7 +131,9 @@ def toggle_extensions_by_ids(curl: curl_cffi.Curl, extension_ids):
 
     def set_akamai_options(self, curl: curl_cffi.Curl, akamai: str):
         """
-        Detailed explanation: https://www.blackhat.com/docs/eu-17/materials/eu-17-Shuster-Passive-Fingerprinting-Of-HTTP2-Clients-wp.pdf
+        function sourced from: https://github.com/lexiforest/curl_cffi/blob/main/curl_cffi/requests/utils.py
+
+        Detailed explanation: https://www.blackhat.com/docs/eu-17/materials/eu-17-Shuster-Passive-Fingerprinting-Of-HTTP2-Clients-wp.pdf
         """
         settings, window_update, streams, header_order = akamai.split("|")
```
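
The docstring above notes that the `*_T` info codes start at `0x600000` while curl_cffi only mapped codes up to `0x400000`. In curl's `curl.h`, a CURLINFO code encodes its value type in the high bits, which is why the unmapped range matters. A small sketch of that encoding (the constants reflect my reading of `curl.h` and the `+ 50` offset for `TOTAL_TIME_T` is an assumption; verify against your curl headers):

```python
# Type classes encoded in the high bits of a CURLINFO option code (assumed from curl.h)
CURLINFO_STRING   = 0x100000
CURLINFO_LONG     = 0x200000
CURLINFO_DOUBLE   = 0x300000
CURLINFO_SLIST    = 0x400000
CURLINFO_OFF_T    = 0x600000  # int64_t results such as TOTAL_TIME_T
CURLINFO_TYPEMASK = 0xf00000

CURLINFO_TOTAL_TIME_T = CURLINFO_OFF_T + 50  # offset assumed, verify against curl.h

def info_type(option_code: int) -> int:
    """Return the type class encoded in a CURLINFO option code."""
    return option_code & CURLINFO_TYPEMASK

# TOTAL_TIME_T sits in the 0x600000 (off_t) class, above the 0x400000 range
# that curl_cffi had mapped, which is why it needed the ffi fallback above.
assert info_type(CURLINFO_TOTAL_TIME_T) == CURLINFO_OFF_T
assert info_type(CURLINFO_TOTAL_TIME_T) > CURLINFO_SLIST
```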

curl_adapter/pycurl.py: 4 additions & 3 deletions

```diff
@@ -42,7 +42,8 @@ def set_curl_options(self, curl, request, url, timeout, proxies):
         super().set_curl_options(curl, request, url, timeout, proxies)
 
         if self.use_curl_content_decoding:
-            # For some reason pycurl content decoding can only be enabled like this:
-            curl.setopt(pycurl.HTTP_CONTENT_DECODING, 0)
+            # It's better to use the urllib3 content decoding instead of letting PyCurl, because it's limited.
+
+            # curl.setopt(pycurl.HTTP_CONTENT_DECODING, 0). There was a time when it needed to disable this in order for it's own encoding to work, weirdly. Though now it doesn't seem neccessary?
             curl.setopt(pycurl.ENCODING, "gzip, deflate") #br, zstd not supported...
-            # Seems it better to use the urllib3 content decoding instead of automatic
+
```

curl_adapter/stream/handler.py: 115 additions & 30 deletions

```diff
@@ -8,9 +8,16 @@
 from curl_cffi.curl import CurlOpt, CurlError
 
 class CurlStreamHandler():
+    """
+    Curl Stream Handler
+
+    :copyright: (c) 2025 by Elis K.
+    """
+
     def __init__(self, curl_instance: typing.Union[curl_cffi.Curl, pycurl.Curl], executor: ThreadPoolExecutor=None, callback_after_perform=None):
         '''
-        Initialize the stream handler.
+        Initialize the stream handler.
         '''
         self.curl = curl_instance
         self.executor = executor or ThreadPoolExecutor()
@@ -23,21 +30,24 @@
         self.allow_cleanup = threading.Event()
         self.perform_finished = threading.Event()
         self.callback_after_perform = callback_after_perform
-
+        self._leftover = bytearray() # buffer for leftover data when chunk > requested
+
     def _write_callback(self, chunk):
         '''
-        Callback to handle incoming data chunks.
+        Callback to handle incoming data chunks.
         '''
         if not self.initialized.is_set():
             self.initialized.set()
         if self.quit_event.is_set():
             return -1 # Signal to stop
 
-        self.chunk_queue.put_nowait(chunk) # Add chunk to the queue
+        self.chunk_queue.put(chunk) # Add chunk to the queue
         return len(chunk)
 
     def _download(self):
 
+        # Possible to set buffer size here
+        # self.curl.setopt(CurlOpt.BUFFERSIZE, 8 * 1024)
         self.curl.setopt(CurlOpt.WRITEFUNCTION, self._write_callback)
 
         try:
@@ -47,14 +57,15 @@
         finally:
             self.chunk_queue.put(None) # End of stream
 
+            if self.callback_after_perform and callable(self.callback_after_perform):
+                self.callback_after_perform()
+
             self.perform_finished.set()
 
             # Set to avoid blocking
             if not self.initialized.is_set():
                 self.initialized.set()
 
-            if self.callback_after_perform and callable(self.callback_after_perform):
-                self.callback_after_perform()
 
     def start(self):
         self._future = self.executor.submit(self._download)
@@ -74,35 +85,110 @@
     def set_headers_parsed(self):
         return self.allow_cleanup.set()
 
     def read(self, amt=None):
-        '''
-        Read data from the queue in chunks. Returns a single chunk or all available data if amt is None.
-        '''
+        """
+        A more 'file-like' read from the queue:
+
+        - If `amt` is None, read all.
+        - If `amt` is an integer, read exactly `amt` bytes.
+        - Handles leftover data from previous chunk to avoid losing bytes.
+        """
+        if self.closed:
+            return b""
+
+        if self.error:
+            raise self.error
+
+        # If amt is None, read everything:
         if amt is None:
-            data = []
-            while True:
-                if self.error:
-                    raise self.error
-                try:
-                    chunk = self.chunk_queue.get(timeout=1)
-                    if chunk is None: # End of stream
-                        break
-                    data.append(chunk)
-                except queue.Empty:
-                    if self.quit_event.is_set():
-                        break
-            return b"".join(data)
-        else:
+            return self._read_all()
+
+        # If amt is specified (and possibly 0 or > 0)
+        return self._read_amt(amt)
+
+    def _read_all(self):
+        """
+        Read *all* remaining data from leftover + queue
+        """
+        out = bytearray()
+
+        # If there's leftover data, use it first
+        out.extend(self._leftover)
+        self._leftover.clear()
+
+        # Then read new chunks until we hit None or are closed
+        while not self.closed:
+            if self.error:
+                raise self.error
+
+            try:
+                chunk = self.chunk_queue.get(timeout=1)
+            except queue.Empty:
+                # No data currently available
+                break
+
+            if chunk is None:
+                # End of stream. Close here?
+                if self.perform_finished.is_set():
+                    self.close()
+                break
+
+            out.extend(chunk)
+
+            if self.quit_event.is_set():
+                break
+
+        return bytes(out)
+
+    def _read_amt(self, amt):
+        """
+        Read exactly `amt` bytes. Returns up to `amt`.
+        """
+        out = bytearray()
+        needed = amt
+
+        # First, consume leftover if available
+        if self._leftover:
+            take = min(needed, len(self._leftover))
+            out.extend(self._leftover[:take])
+            del self._leftover[:take]
+            needed -= take
+
+        # Read additional chunks from the queue if we still need data
+        while needed > 0 and not self.closed:
             if self.error:
                 raise self.error
+
             try:
                 chunk = self.chunk_queue.get(timeout=1)
-                if chunk is None: # End of stream
-                    return b""
-                return chunk[:amt]
             except queue.Empty:
-                return b""
+                # Temporarily no data
+                break
+
+            if chunk is None:
+                # End of stream. close here?
+                if self.perform_finished.is_set():
+                    self.close()
+
+                break
+
+            # If the chunk is bigger than needed, take part of it
+            # and store the remainder in _leftover.
+            if len(chunk) > needed:
+                out.extend(chunk[:needed])
+                self._leftover.extend(chunk[needed:])
+                needed = 0
+            else:
+                # Chunk fits entirely
+                out.extend(chunk)
+                needed -= len(chunk)
+
+            if self.quit_event.is_set():
+                break
+
+        return bytes(out)
 
     def flush(self):
+        #self._leftover.clear()
         pass
 
     def close(self):
@@ -122,11 +208,10 @@
         # self.curl.close()
         self.allow_cleanup.wait(timeout=1)
         self.curl.reset()
-
-
+
     def __del__(self):
         '''
-        Destructor to ensure the response is properly closed when garbage-collected.
+        Destructor to ensure the response is properly closed when garbage-collected.
         '''
         if not self.closed:
             self.close()
```
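
The `_read_amt` path added above follows a standard pattern: serve bytes from `_leftover` first, then pull whole chunks off the queue, and stash any overshoot back into `_leftover` so no bytes are lost between `read()` calls. The same logic can be exercised in isolation with a plain `queue.Queue` (a minimal stdlib sketch of the pattern, not the handler itself):

```python
import queue

class LeftoverReader:
    """Minimal sketch of the leftover-buffer read pattern used in the handler."""

    def __init__(self):
        self.chunks = queue.Queue()
        self._leftover = bytearray()

    def read(self, amt: int) -> bytes:
        out = bytearray()
        needed = amt
        # Serve bytes left over from a previous oversized chunk first
        if self._leftover:
            take = min(needed, len(self._leftover))
            out.extend(self._leftover[:take])
            del self._leftover[:take]
            needed -= take
        while needed > 0:
            try:
                chunk = self.chunks.get_nowait()
            except queue.Empty:
                break
            if chunk is None:  # end-of-stream sentinel, as in the handler
                break
            if len(chunk) > needed:
                out.extend(chunk[:needed])
                self._leftover.extend(chunk[needed:])  # keep the overshoot
                needed = 0
            else:
                out.extend(chunk)
                needed -= len(chunk)
        return bytes(out)

r = LeftoverReader()
r.chunks.put(b"hello world")
r.chunks.put(None)
assert r.read(5) == b"hello"   # first read stops mid-chunk
assert r.read(6) == b" world"  # remainder comes from the leftover buffer
```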

curl_adapter/stream/response.py: 7 additions & 2 deletions

```diff
@@ -17,7 +17,7 @@
 class BytesQueueBuffer:
     """
-    Needed to support newer versions of urllib3
+    this class is sourced from urllib3 HTTPResponse. It's needed to support newer versions of urllib3
     ------------------------------------------
     Memory-efficient bytes buffer
@@ -130,7 +130,7 @@ def __init__(
         version=None, #HTTP Version header
 
         preload_content=False,
-        enforce_content_length=False,
+        enforce_content_length=True,
         auto_close=True,
     ):
 
@@ -155,6 +155,11 @@
 
         self.decode_content = self._handle_content_decoding
         self.enforce_content_length = enforce_content_length
+
+        if not self._handle_content_decoding:
+            # In cases when curl is handling content decoding, disable content length checks otherwise we might get unexcepted errors
+            self.enforce_content_length = False
+
         self.auto_close = auto_close
 
         self._decoder = None
```
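
The `enforce_content_length` guard added above exists because when curl performs content decoding itself, the body handed to urllib3 no longer matches the wire `Content-Length`, so a strict length check would raise spurious errors. A small stdlib illustration of the mismatch:

```python
import gzip

body = b"hello world " * 64        # what the application sees after decoding
wire = gzip.compress(body)         # what was actually on the wire

content_length = len(wire)         # the Content-Length header advertises the wire size
assert gzip.decompress(wire) == body
assert len(body) != content_length # decoded length differs, so a strict length check would fail
```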

0 commit comments