Skip to content

API

consumo

Content Consumption Analyzer.

ConsumoError

Bases: Exception

Base class for consumo exceptions.

Source code in src/consumo/lib/exceptions.py
class ConsumoError(Exception):
    """Base class for consumo exceptions.

    It adds nothing to ``Exception``; it only provides a common ancestor for
    the errors consumo defines.
    """

MissingMetadataError

Bases: ConsumoError

Raised when a backend can't get the duration of a file from its metadata.

Source code in src/consumo/lib/exceptions.py
class MissingMetadataError(ConsumoError):
    """Raised when a backend can't get the duration of a file from its metadata.

    It adds nothing to ``ConsumoError``; it only gives this failure its own
    type.
    """

calculate_html_consumption_time(html, words_per_minute=265, multimedia_duration_resolver=None)

Calculate the consumption time of an HTML file in seconds.

Uses concurrency to get the duration of any multimedia in the file to avoid any possible throttling.

Parameters:

Name Type Description Default
html FilePath

Path to the HTML file whose consumption time will be calculated.

required
words_per_minute NonNegativeInt

Reading speed in words per minute.

265
multimedia_duration_resolver

Function used to get the duration of a multimedia file.

None

Returns:

Type Description
int

The time in seconds to consume the content of the HTML file.

Source code in src/consumo/lib/file/html.py
@validate_call
def calculate_consumption_time(
    html: FilePath,
    words_per_minute: NonNegativeInt = 265,
    multimedia_duration_resolver=None,  # noqa: ANN001 (unsupported by Typer)
) -> int:
    """Estimate how many seconds it takes to consume an HTML file.

    The estimate is the sum of the reading time of the extracted text, the
    viewing time of the images, the duration of every multimedia source and
    the duration reported for custom players. Multimedia durations are
    resolved concurrently through a thread pool.

    Args:
        html: Path to the HTML file whose consumption time will be calculated.
        words_per_minute: Reading speed in words per minute.
        multimedia_duration_resolver: Callable that receives the source of a
            multimedia element and returns its duration in seconds. When
            omitted, sources are resolved with `get_multimedia_duration`
            against `html`.

    Returns:
        The time in seconds to consume the content of the HTML file.
    """
    # A parameter default can't refer to another parameter (`html`), so the
    # fallback resolver has to be built here rather than in the signature.
    resolver = multimedia_duration_resolver

    if resolver is None:

        def resolver(src: str) -> int:
            return get_multimedia_duration(html, src)

    markup: str = html.read_text("utf-8")
    document: BeautifulSoup = BeautifulSoup(markup, "lxml")

    # The extraction may yield None, in which case no text is counted.
    words, cjk_characters = get_word_count(trafilatura.extract(markup) or "")
    reading_seconds: int = calculate_reading_time(
        words, cjk_characters, words_per_minute
    )

    viewing_seconds: int = calculate_viewing_time(len(document("img")))

    sources: list[str] = extract_multimedias(document)

    with ThreadPoolExecutor() as pool:
        multimedia_seconds: int = sum(pool.map(resolver, sources))

    custom_player_seconds: int = get_custom_player_duration(html)

    return (
        reading_seconds
        + viewing_seconds
        + multimedia_seconds
        + custom_player_seconds
    )

calculate_mass_media_consumption_time(container, words_per_minute=265)

Calculate the consumption time of a text container file in seconds.

Parameters:

Name Type Description Default
container FilePath

Path to a file primarily meant for text. Supported types are EPUB, MOBI and PDF.

required
words_per_minute NonNegativeInt

Reading speed in words per minute.

265

Returns:

Type Description
int

The time in seconds to consume the content of the file.

Source code in src/consumo/lib/file/mass_media.py
@validate_call
def calculate_consumption_time(
    container: FilePath, words_per_minute: NonNegativeInt = 265
) -> int:
    """Estimate how many seconds it takes to read a text container file.

    Args:
        container: Path to a file primarily meant for text. Supported types
            are EPUB, MOBI and PDF.
        words_per_minute: Reading speed in words per minute.

    Returns:
        The time in seconds to consume the content of the file.
    """
    words, cjk_characters = get_word_count(extract_text(container))

    return calculate_reading_time(words, cjk_characters, words_per_minute)

calculate_reading_time(word_count, cjk_character_count, words_per_minute=265)

Calculate the reading time in seconds based on word count.

Supports Chinese, Japanese, and Korean (CJK) by reading CJK characters at 1.8867924528 (500 / 265) times the word reading speed. This is done because, in the Medium formula, the average reading speed is 265 words per minute, while the average for non-alphabetical languages is 500 characters per minute.

Parameters:

Name Type Description Default
word_count int

The number of words in the text.

required
cjk_character_count int

The number of CJK characters in the text.

required
words_per_minute NonNegativeInt

Reading speed in words per minute.

265

Returns:

Type Description
int

How long in seconds it would take to read the text.

Source code in src/consumo/lib/file/text.py
@validate_call
def calculate_reading_time(
    word_count: int, cjk_character_count: int, words_per_minute: NonNegativeInt = 265
) -> int:
    """Work out how many seconds it takes to read a text.

    Chinese, Japanese and Korean (CJK) characters are read at 1.8867924528
    (500 / 265) times the word speed. The ratio comes from the Medium formula,
    where the average reading speed is 265 words per minute and the average
    for non-alphabetical languages is 500 characters per minute.

    Args:
        word_count: The number of words in the text.
        cjk_character_count: The number of CJK characters in the text.
        words_per_minute: Reading speed in words per minute.

    Returns:
        How long in seconds it would take to read the text, rounded up.

    Raises:
        ZeroDivisionError: If `words_per_minute` is 0.
    """
    seconds_per_minute: int = 60
    # 500 / 265 \approx 1.8867924528301887.
    cjk_speed_ratio: float = 1.8867924528301887

    word_seconds: float = (word_count / words_per_minute) * seconds_per_minute

    cjk_characters_per_minute: float = words_per_minute * cjk_speed_ratio
    cjk_seconds: float = (
        cjk_character_count / cjk_characters_per_minute
    ) * seconds_per_minute

    return math.ceil(word_seconds + cjk_seconds)

calculate_text_consumption_time(container, words_per_minute=265)

Calculate the consumption time of a plain text file in seconds.

Parameters:

Name Type Description Default
container FilePath

Path to the plain text file whose consumption time will be calculated.

required
words_per_minute NonNegativeInt

Reading speed in words per minute.

265

Returns:

Type Description
int

The time in seconds to consume the content in the plain text file.

Source code in src/consumo/lib/file/text.py
def calculate_consumption_time(
    container: FilePath, words_per_minute: NonNegativeInt = 265
) -> int:
    """Estimate how many seconds it takes to read a plain text file.

    The file is read as UTF-8.

    Args:
        container: Path to the plain text file whose consumption time will be
            calculated.
        words_per_minute: Reading speed in words per minute.

    Returns:
        The time in seconds to consume the content in the plain text file.
    """
    words, cjk_characters = get_word_count(container.read_text("utf-8"))
    reading_seconds: int = calculate_reading_time(
        words, cjk_characters, words_per_minute
    )

    return reading_seconds

calculate_url_consumption_time(url, words_per_minute=265)

Calculate the consumption time of a URL in seconds.

Avoids code duplication by downloading the HTML of the URL to a temporary file, to use the HTML backend calculate_html_consumption_time.

Parameters:

Name Type Description Default
url HttpUrl

URL pointing to the content whose consumption time will be analyzed.

required
words_per_minute NonNegativeInt

Reading speed in words per minute.

265

Returns:

Type Description
int

The time in seconds to consume the content the URL points to.

Raises:

Type Description
ConnectionError

If the HTML content of the URL wasn't downloaded.

Source code in src/consumo/lib/url.py
@validate_call
def calculate_consumption_time(
    url: HttpUrl, words_per_minute: NonNegativeInt = 265
) -> int:
    """Calculate the consumption time of a URL in seconds.

    Avoids code duplication by downloading the HTML of the URL to a temporary
    file, to use the HTML backend [`calculate_html_consumption_time`][consumo.calculate_html_consumption_time].

    Args:
        url: URL pointing to the content whose consumption time will be analyzed.
        words_per_minute: Reading speed in words per minute.

    Returns:
        The time in seconds to consume the content the URL points to.

    Raises:
        ConnectionError: If the HTML content of the URL wasn't downloaded.
    """
    html_content: str | None = trafilatura.fetch_url(str(url))

    if html_content is None:
        # Name the URL that failed so the error is actionable; the exception
        # type is unchanged.
        message: str = f"Could not download the HTML content of {url}"

        raise ConnectionError(message)

    with TemporaryDirectory() as tmp_dir:
        html: Path = Path(tmp_dir) / "temp.html"

        html.write_text(html_content, "utf-8")

        # Multimedia sources are resolved against the original URL rather
        # than the temporary file the HTML was saved to.
        return calculate_html_consumption_time(
            html, words_per_minute, lambda src: get_multimedia_duration(url, src)
        )

calculate_viewing_time(image_count)

Calculate the time for viewing images based on count.

Parameters:

Name Type Description Default
image_count NonNegativeInt

The number of images.

required

Returns:

Type Description
int

The time in seconds to view all the images.

Source code in src/consumo/lib/file/image.py
@validate_call
def calculate_viewing_time(image_count: NonNegativeInt) -> int:
    """Work out how many seconds it takes to view a number of images.

    The first image takes 12 seconds and each following one takes a second
    less, down to 3 seconds for the tenth. Every image after the tenth takes
    3 seconds.

    Args:
        image_count: The number of images.

    Returns:
        The time in seconds to view all the images.
    """
    decreasing_images: int = 10

    if image_count <= decreasing_images:
        # Sum of the arithmetic progression 12, 11, ..., 3:
        # S_{n} = \frac{n(25 - n)}{2}
        progression_seconds: int | float = (image_count * (25 - image_count)) / 2

        return math.ceil(progression_seconds)

    # 75 is the total for the first ten images (S_{10}).
    extra_images: int = image_count - 10

    return 75 + extra_images * 3

extract_mass_media_text(container)

Extract text from a text container file.

Parameters:

Name Type Description Default
container FilePath

Path to a file primarily meant for text. Supported types are EPUB, MOBI and PDF.

required

Returns:

Type Description
str

All the text content in the container.

Source code in src/consumo/lib/file/mass_media.py
@validate_call
def extract_text(container: FilePath) -> str:
    """Pull the text out of a text container file.

    The text of every page is joined with a single space and the surrounding
    whitespace of the result is removed.

    Args:
        container: Path to a file primarily meant for text. Supported types
            are EPUB, MOBI and PDF.

    Returns:
        All the text content in the container.
    """
    with pymupdf.open(container) as document:
        page_texts: list[str] = [page.get_text() for page in document]

    return " ".join(page_texts).strip()

extract_multimedias(soup)

Get all the multimedia sources from an HTML file.

Parameters:

Name Type Description Default
soup BeautifulSoup

The HTML file as parsed by BeautifulSoup.

required

Returns:

Type Description
list[str]

A list of all the multimedia sources.

Source code in src/consumo/lib/file/html.py
def extract_multimedias(soup: BeautifulSoup) -> list[str]:
    """Get all the multimedia sources from an HTML file.

    Sources are collected from `<audio>`, `<iframe>` and `<video>` elements,
    in that order. For `<audio>` and `<video>`, the element's own "src"
    attribute is used when present; otherwise the first nested `<source>`
    element that has a "src" attribute is used. Elements without any source
    are skipped.

    Args:
        soup: The HTML file as parsed by BeautifulSoup.

    Returns:
        A list of all the multimedia sources.
    """
    result: list[str] = []

    # Keep the grouping order callers already get: audios, iframes, videos.
    for tag_name in ("audio", "iframe", "video"):
        for element in soup(tag_name):
            src = element.get("src")

            # Media elements may declare their file in <source> children
            # instead of a "src" attribute; indexing the children blindly
            # would fail when there are none.
            if src is None and tag_name != "iframe":
                for source in element("source"):
                    src = source.get("src")

                    if src is not None:
                        break

            # A missing source must not become the literal string "None".
            if src is None:
                continue

            result.append(str(src))

    return result

format_time(total_seconds)

Format the duration/consumption time given in seconds in a *h *m *s format.

Parameters:

Name Type Description Default
total_seconds int

The duration/consumption time in seconds of the content.

required

Returns:

Type Description
str

The duration/consumption time in a *h *m *s format.

Source code in src/consumo/lib/formatting.py
def format_time(total_seconds: int) -> str:
    """Format the duration/consumption time given in seconds in a *h *m *s format.

    Hours and minutes are left out when they are zero; seconds are always
    included. Hours are not wrapped at 24, so durations of a day or longer
    keep their full hour count (e.g. 90000 seconds is "25h 0s").

    Args:
        total_seconds: The duration/consumption time in seconds of the content.

    Returns:
        The duration/consumption time in a *h *m *s format.
    """
    minutes, seconds = divmod(total_seconds, 60)
    # There is no day unit in the output, so hours must carry everything
    # above minutes; wrapping them at 24 would silently drop whole days.
    hours, minutes = divmod(minutes, 60)

    parts: list[str] = []

    if hours:
        parts.append(f"{hours}h")
    if minutes:
        parts.append(f"{minutes}m")

    parts.append(f"{seconds}s")

    return " ".join(parts)

get_custom_player_duration(html)

Parse the JSON data in an HTML file provided for SEO to get video duration.

Designed with videos using custom players like the BBC's smp-toucan-player in mind.

The supported format for duration is ISO 8601.

Parameters:

Name Type Description Default
html FilePath

Path to the HTML file whose content will be parsed for JSON data containing duration information.

required

Returns:

Type Description
int

The duration reported by the JSON data as an integer representing seconds.

Source code in src/consumo/lib/file/html.py
@validate_call
def get_custom_player_duration(html: FilePath) -> int:
    """Add up the video durations declared in the SEO JSON data of an HTML file.

    Designed with videos using custom players like the BBC's smp-toucan-player
    in mind. Only `<script>` tags whose "data-schema" attribute is
    "video-object" are inspected.

    The supported format for duration is ISO 8601.

    Args:
        html: Path to the HTML file whose content will be parsed for JSON data
            containing duration information.

    Returns:
        The duration reported by the JSON data as an integer representing
        seconds.
    """
    document: BeautifulSoup = BeautifulSoup(html.read_text("utf-8"), "lxml")
    scripts: ResultSet[Tag] = document("script", {"data-schema": "video-object"})
    seconds: int = 0

    for script in scripts:
        payload = script.string

        if not payload:
            continue

        # Best effort: a script that can't be parsed, or whose duration can't
        # be validated, contributes nothing instead of aborting the total.
        try:
            raw_duration: str | None = json.loads(payload).get("duration")

            if not raw_duration:
                continue

            parsed: timedelta = DURATION_ADAPTER.validate_python(raw_duration)
            seconds += int(parsed.total_seconds())
        except Exception:
            continue

    return seconds

get_html_multimedia_duration(html, src)

Get the duration of a multimedia file in an HTML file.

Tries to treat the multimedia file as if it was hosted online, then tries to resolve its path if that fails.

Parameters:

Name Type Description Default
html FilePath

Path to the HTML file where the multimedia file was found.

required
src str

Path used for the file's "src" attribute.

required

Returns:

Type Description
int

The duration of the content in seconds.

Source code in src/consumo/lib/file/html.py
@validate_call
def get_multimedia_duration(html: FilePath, src: str) -> int:
    """Resolve the duration of a multimedia file referenced by an HTML file.

    The source is first handled as a URL of a file hosted online. If that
    fails validation, it is handled as a path to be resolved using the HTML
    file.

    Args:
        html: Path to the HTML file where the multimedia file was found.
        src: Path used for the file's "src" attribute.

    Returns:
        The duration of the content in seconds.
    """
    try:
        hosted_source = HttpUrl(src)
        duration: int = get_absolute_path_multimedia_duration(hosted_source)
    except ValidationError:
        # A src that isn't a valid URL is likely a relative path instead.
        duration = get_relative_path_multimedia_duration(html, Path(src))

    return duration

get_multimedia_duration(url)

Get the duration of a multimedia container hosted online.

Tries to treat the URL as if it was from a hosting platform, then tries to get the duration from the container as if the URL pointed directly to it if that fails.

Parameters:

Name Type Description Default
url HttpUrl

URL pointing to where the multimedia container is hosted.

required

Returns:

Type Description
int

The duration of the content in seconds.

Source code in src/consumo/lib/file/multimedia.py
def get_duration(url: HttpUrl) -> int:
    """Resolve the duration of a multimedia container hosted online.

    The URL is first handled as if it came from a hosting platform. If that
    raises `DownloadError` or `MissingMetadataError`, the duration is read
    from the container as if the URL pointed directly to it.

    Args:
        url: URL pointing to where the multimedia container is hosted.

    Returns:
        The duration of the content in seconds.
    """
    try:
        return get_hosted_multimedia_duration(url)
    except (DownloadError, MissingMetadataError):
        # Not a hosting-platform URL (or no metadata): read the file itself.
        return get_multimedia_duration(url)

get_url_multimedia_duration(url, src)

Get the duration of a multimedia hosted online.

Tries to treat the file as if it had an absolute path, then tries to resolve its path if that fails.

Parameters:

Name Type Description Default
url HttpUrl

URL where the multimedia file was originally found for path resolution.

required
src str

Path used for the multimedia file's "src" attribute.

required

Returns:

Type Description
int

The duration of the content in seconds.

Source code in src/consumo/lib/url.py
@validate_call
def get_multimedia_duration(url: HttpUrl, src: str) -> int:
    """Resolve the duration of a multimedia file referenced by an online page.

    The source is first handled as an absolute URL. If that fails validation,
    it is handled as a path to be resolved using the URL of the page.

    Args:
        url: URL where the multimedia file was originally found for
            path resolution.
        src: Path used for the multimedia file's "src" attribute.

    Returns:
        The duration of the content in seconds.
    """
    try:
        absolute_source = HttpUrl(src)
        duration: int = get_absolute_path_multimedia_duration(absolute_source)
    except ValidationError:
        # A src that isn't a valid URL is likely a relative path instead.
        duration = get_relative_path_multimedia_duration(url, Path(src))

    return duration

get_word_count(text)

Get the number of words from text.

Parameters:

Name Type Description Default
text str

Text where the number of words will be counted from.

required

Returns:

Type Description
tuple[int, int]

The number of words in the text, followed by the number of CJK characters in the text.

Source code in src/consumo/lib/file/text.py
def get_word_count(text: str) -> tuple[int, int]:
    """Count the words and the CJK characters in a text.

    Words are the whitespace-separated pieces of the text. CJK characters are
    the ones in the Han, Hiragana, Katakana or Hangul scripts.

    Args:
        text: Text where the number of words will be counted from.

    Returns:
        The number of words in the text, followed by the number of CJK
        characters in the text.
    """
    cjk_pattern: str = r"\p{Script=Han}|\p{Hiragana}|\p{Katakana}|\p{Script=Hangul}"
    words: list[str] = text.split()
    cjk_characters: list[str] = regex.findall(cjk_pattern, text)

    return len(words), len(cjk_characters)