From 9854382616eaeb2306c8f523541b9ec1276c97e6 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 9 Jun 2026 11:47:16 +0200 Subject: [PATCH 1/3] refactor(scrapy): make AsyncThread timeout configurable --- src/apify/scrapy/_async_thread.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/src/apify/scrapy/_async_thread.py b/src/apify/scrapy/_async_thread.py index 79de1162d..16665b6f2 100644 --- a/src/apify/scrapy/_async_thread.py +++ b/src/apify/scrapy/_async_thread.py @@ -14,13 +14,18 @@ class AsyncThread: - """Class for running an asyncio event loop in a separate thread. - - This allows running asynchronous coroutines from synchronous code by executingthem on an event loop - that runs in its own dedicated thread. + """Run an asyncio event loop in a dedicated background thread. + + This lets synchronous Scrapy callbacks drive asynchronous Apify and Crawlee coroutines. Each + consumer (the scheduler and the HTTP cache storage) owns its own `AsyncThread`, so the request + queue and the key-value store each live entirely on a single, separate event loop and are never + shared across loops. They do read the same global `Configuration`, which is read-only here, so + the isolation holds. A single shared loop would also work but would couple the otherwise + independent lifecycles of those Scrapy components. """ - def __init__(self) -> None: + def __init__(self, default_timeout: timedelta = timedelta(seconds=60)) -> None: + self._default_timeout = default_timeout self._eventloop = asyncio.new_event_loop() # Start the event loop in a dedicated daemon thread. @@ -33,7 +38,7 @@ def __init__(self) -> None: def run_coro( self, coro: Coroutine, - timeout: timedelta = timedelta(seconds=60), + timeout: timedelta | None = None, ) -> Any: """Run a coroutine on an event loop running in a separate thread. @@ -42,7 +47,8 @@ def run_coro( Args: coro: The coroutine to run. - timeout: The maximum number of seconds to wait for the coroutine to finish. + timeout: The maximum time to wait for the coroutine to finish. Defaults to the + `default_timeout` passed to the constructor. Returns: The result returned by the coroutine. @@ -52,6 +58,9 @@ def run_coro( TimeoutError: If the coroutine does not complete within the timeout. Exception: Any exception raised during coroutine execution. """ + if timeout is None: + timeout = self._default_timeout + if not self._eventloop.is_running(): raise RuntimeError(f'The coroutine {coro} cannot be executed because the event loop is not running.') From df26807195b1c0b4663006fc0b8b2a8466d4698f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 9 Jun 2026 12:56:16 +0200 Subject: [PATCH 2/3] style(scrapy): tighten comments and docstrings --- src/apify/scrapy/_async_thread.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/apify/scrapy/_async_thread.py b/src/apify/scrapy/_async_thread.py index 16665b6f2..946b8b95a 100644 --- a/src/apify/scrapy/_async_thread.py +++ b/src/apify/scrapy/_async_thread.py @@ -16,12 +16,10 @@ class AsyncThread: """Run an asyncio event loop in a dedicated background thread. - This lets synchronous Scrapy callbacks drive asynchronous Apify and Crawlee coroutines. Each - consumer (the scheduler and the HTTP cache storage) owns its own `AsyncThread`, so the request - queue and the key-value store each live entirely on a single, separate event loop and are never - shared across loops. They do read the same global `Configuration`, which is read-only here, so - the isolation holds. A single shared loop would also work but would couple the otherwise - independent lifecycles of those Scrapy components. + This lets synchronous Scrapy callbacks drive asynchronous Apify and Crawlee coroutines. The + scheduler and the HTTP cache storage each own their own `AsyncThread`, so the request queue and + the key-value store never share an event loop; they only share the read-only global + `Configuration`. A single shared loop would also work but would couple their lifecycles. """ def __init__(self, default_timeout: timedelta = timedelta(seconds=60)) -> None: From 85237fa9828d480096ec8add9e8fa85b5c552ba8 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 9 Jun 2026 14:38:09 +0200 Subject: [PATCH 3/3] refactor(scrapy): use 'default' sentinel for run_coro timeout --- src/apify/scrapy/_async_thread.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/apify/scrapy/_async_thread.py b/src/apify/scrapy/_async_thread.py index 946b8b95a..0333531b5 100644 --- a/src/apify/scrapy/_async_thread.py +++ b/src/apify/scrapy/_async_thread.py @@ -5,7 +5,7 @@ from concurrent import futures from datetime import timedelta from logging import getLogger -from typing import TYPE_CHECKING, Any +from typing import TYPE_CHECKING, Any, Literal if TYPE_CHECKING: from collections.abc import Coroutine @@ -36,7 +36,7 @@ def __init__(self, default_timeout: timedelta = timedelta(seconds=60)) -> None: def run_coro( self, coro: Coroutine, - timeout: timedelta | None = None, + timeout: timedelta | Literal['default'] = 'default', ) -> Any: """Run a coroutine on an event loop running in a separate thread. @@ -45,7 +45,7 @@ def run_coro( Args: coro: The coroutine to run. - timeout: The maximum time to wait for the coroutine to finish. Defaults to the + timeout: The maximum time to wait for the coroutine to finish. Pass `'default'` to use the `default_timeout` passed to the constructor. Returns: @@ -56,7 +56,7 @@ def run_coro( TimeoutError: If the coroutine does not complete within the timeout. Exception: Any exception raised during coroutine execution. """ - if timeout is None: + if timeout == 'default': timeout = self._default_timeout if not self._eventloop.is_running():