Examples

This page contains some more advanced and realistic examples of using Unparallel.

Query (all) posts of a WordPress site

"""
This script uses the WordPress API to query (all) posts of the website
https://techcrunch.com

See Also:
    https://developer.wordpress.org/rest-api/reference/posts/
"""

import asyncio
from pprint import pp

import httpx

from unparallel import up
from unparallel.unparallel import RequestError


async def main():
    page_size = 20
    base_url = "https://techcrunch.com/wp-json/wp/v2"
    pagination_url = f"/posts?per_page={page_size}"

    # Get page count
    response = httpx.head(base_url + pagination_url)
    total_pages = int(response.headers["X-WP-TotalPages"])
    print(f"Website '{base_url}' has {total_pages} pages (page size = {page_size})")

    # Comment the line below to get all pages. Note that you might have to adjust
    # the settings for this to work without errors. For me, it worked using
    # max_connections=800 and timeout=60.

    total_pages = min(total_pages, 100)

    # Get all pages and flatten the result
    paths = [f"{pagination_url}&page={i}" for i in range(1, total_pages + 1)]
    results = await up(paths, method="GET", base_url=base_url, flatten_result=True)

    # Check if some requests failed
    valid_results = [item for item in results if not isinstance(item, RequestError)]
    fails = len(results) - len(valid_results)
    print(f"{fails=} ({fails / len(results):.2%})")

    # Display some properties of the first 5 posts
    intersting_keys = ["id", "date", "slug", "title", "author"]
    pp(
        [
            {k: v for k, v in item.items() if k in intersting_keys}
            for item in valid_results[:5]
        ]
    )


if __name__ == "__main__":
    asyncio.run(main())

If you run the example, it should print something like the following:

Website 'https://techcrunch.com/wp-json/wp/v2' has 12202 pages (page size = 20)
Making async requests: 100%|███████████| 1000/1000 [00:13<00:00, 74.30it/s]
[{'id': 2642913,
  'date': '2023-12-26T07:05:21',
  'slug': 'vcs-are-entering-2024-with-healthy-paranoia',
  'title': {'rendered': 'VCs are entering 2024 with &#8216;healthy '
                        'paranoia&#8217;'},
  'author': 428363},
 {'id': 2645233,
  'date': '2023-12-26T06:35:00',
  'slug': 'what-vcs-are-looking-for-in-the-next-wave-of-cybersecurity-startups',
  'title': {'rendered': 'What VCs are looking for in the next wave of '
                        'cybersecurity startups'},
  'author': 133574551},
 {'id': 2641499,
  'date': '2023-12-26T06:05:55',
  'slug': 'hackers-stole-2-billion-in-crypto-in-2023-data-shows',
  'title': {'rendered': 'Hackers stole $2 billion in crypto in 2023, data '
                        'shows'},
  'author': 133574594},
 {'id': 2635851,
  'date': '2023-12-26T05:05:28',
  'slug': 'the-eternal-struggle-between-open-source-and-proprietary-software',
  'title': {'rendered': 'The eternal struggle between open source and '
                        'proprietary software'},
  'author': 133574560},
 {'id': 2645355,
  'date': '2023-12-26T03:52:55',
  'slug': 'nonprofit-code-org-sues-byjus-unit-whitehat-jr-over-payment-dues',
  'title': {'rendered': 'Nonprofit Code.org sues Byju&#8217;s unit WhiteHat Jr '
                        'over payment dues'},
  'author': 133574269}]

Fetch the content of multiple websites

"""
This script is based on a use case at StackOverflow and fetches the content of various
webpages.

See Also:
    https://www.stackoverflow.com/a/57129241
"""

import asyncio

from unparallel import up

# Extracted from https://en.wikipedia.org/wiki/List_of_most-visited_websites
websites = """https://www.google.com/
https://www.youtube.com/
https://www.facebook.com/
https://www.wikipedia.org/
https://www.instagram.com/
https://www.reddit.com/
https://www.amazon.com/
https://www.duckduckgo.com/
https://www.yahoo.com/
https://www.tiktok.com/
https://www.bing.com/
https://www.yahoo.co.jp/
https://www.weather.com/
https://www.whatsapp.com/
https://www.yandex.ru/
https://www.openai.com/
https://www.live.com/
https://www.microsoft.com/
https://www.linkedin.com/
https://www.quora.com/
https://www.twitch.tv/
https://www.naver.com/
https://www.netflix.com/
https://www.office.com/
https://www.vk.com/
https://www.globo.com/
https://www.Aliexpress.com/
https://www.cnn.com/
https://www.zoom.us/
https://www.imdb.com/
https://www.x.com/
https://www.nytimes.com/
https://www.espn.com/
https://www.amazon.co.jp/
https://www.pinterest.com/
https://www.uol.com.br/
https://www.ebay.com/
https://www.marca.com/
https://www.canva.com/
https://www.spotify.com/
https://www.bbc.com/
https://www.paypal.com/
https://www.apple.com/"""


async def main():
    urls = websites.split("\n")

    # Get all pages
    results = await up(
        urls, method="GET", response_fn=lambda x: x.text, raise_for_status=False
    )

    # Print the content of the first 5 pages
    for url, content in zip(urls, results[:5]):
        print(url, repr(content[:100]))


if __name__ == "__main__":
    asyncio.run(main())

If you run the example, it should print something like the following:

Making async requests: 100%|███████████| 43/43 [00:03<00:00, 11.60it/s]
https://www.google.com/ '<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="de-AT"><head><meta cont'
https://www.youtube.com/ '<!DOCTYPE html><html style="font-size: 10px;font-family: Roboto, Arial, sans-serif;" lang="de-DE" da'
https://www.facebook.com/ '<!DOCTYPE html>\n<html lang="de" id="facebook" class="no_js">\n<head><meta charset="utf-8" /><meta nam'
https://www.wikipedia.org/ '<!DOCTYPE html>\n<html lang="en" class="no-js">\n<head>\n<meta charset="utf-8">\n<title>Wikipedia</title'
https://www.instagram.com/ '<!DOCTYPE html><html class="_9dls" lang="en" dir="ltr"><head><link data-default-icon="https://static'