# Source code for eucrim.issue.utils

# SPDX-FileCopyrightText: 2024 Thomas Breitner <t.breitner@csl.mpg.de>
#
# SPDX-License-Identifier: EUPL-1.2

import logging
import pathlib
import re
import os
from collections import namedtuple
import pdftotext
from pypdf import PdfReader
from wordcloud import WordCloud, STOPWORDS
from django.conf import settings


logger = logging.getLogger(__name__)


def canonical_issue_filename(issue_year, issue_number, filetype=None):
    """Build the canonical name of an issue, optionally as a file name.

    Without a ``filetype`` the result is the bare identifier
    ``<year>-0<number>``. With a (truthy) ``filetype`` the identifier is
    wrapped into ``eucrim_issue_<year>-0<number>.<filetype>``.

    Note that a literal ``0`` is always placed in front of the issue number;
    it is not zero-padding to a fixed width.
    """
    stem = f"{issue_year}-0{issue_number}"
    if not filetype:
        return stem
    return f"eucrim_issue_{stem}.{filetype}"
# Result record shared by the PDF focus extractors in this module:
#   title_en / title_fr / title_ge -- focus title per language
#   legacy_toc                     -- table of contents taken from the cover page
#   error                          -- error message, or None
PDFExtractResult = namedtuple(
    "PDFExtractResult",
    "title_en title_fr title_ge legacy_toc error",
)
def extract_focuses_from_pdfmetadata(contentfile):
    """Read the three focus titles from the PDF metadata "subject" field.

    The subject is expected to have the form ``title_en;title_fr;title_ge``.
    It is split on ``;`` and must yield exactly three parts.

    Returns a ``PDFExtractResult`` whose ``legacy_toc`` is always ``None``.
    On failure all three titles are ``None`` and ``error`` carries a
    description; on success ``error`` is ``None``.
    """
    metadata = PdfReader(contentfile).metadata

    titles = (None, None, None)
    error = None
    try:
        english, french, german = metadata.subject.split(";")
    except AttributeError as e:
        # Raised when the metadata or its subject is missing (None).
        error = f"No PDF meta subject found (should be: title_en;title_fr;title_ge)! Error message was: {e}"
    except ValueError as e:
        # Raised when the split does not yield exactly three parts.
        error = f"Not all issue titles (EN, FR, GE) found or unable to parse PDF meta subject (should be: title_en;title_fr;title_ge)! Error message was: {e}"
    else:
        titles = (english, french, german)

    title_en, title_fr, title_ge = titles
    return PDFExtractResult(
        title_en=title_en,
        title_fr=title_fr,
        title_ge=title_ge,
        legacy_toc=None,
        error=error,
    )
def extract_focuses_from_pdfcoverpage(contentfile):
    """Read the focus titles and the table of contents from the PDF cover.

    Only the first page of the PDF is inspected. For each language the first
    line starting with the respective keyword ("Focus", "Dossier particulier",
    "Schwerpunktthema") is used as the title, or an empty string if no such
    line exists. Everything following the "Schwerpunktthema" line is returned
    as ``legacy_toc`` with newlines replaced by ``<br>``.

    Returns a ``PDFExtractResult`` whose ``error`` is always ``None``.
    """
    # Only the first page (the cover) is processed.
    cover_text = pdftotext.PDF(contentfile)[0]

    def matching_line(pattern):
        """Return the first line matching ``pattern``, or "" if none does."""
        found = re.search(pattern, cover_text, re.MULTILINE)
        return "" if found is None else found.group()

    focus_en = matching_line(r"^Focus.*$")
    focus_fr = matching_line(r"^Dossier particulier.*$")
    focus_ge = matching_line(r"^Schwerpunktthema.*$")

    # The table of contents is whatever follows the German title line.
    toc_found = re.search(
        r"^Schwerpunktthema.*?$(.*)",
        cover_text,
        flags=re.MULTILINE | re.DOTALL,
    )
    if toc_found is None:
        toc_html = ""
    else:
        toc_html = toc_found.group(1).replace("\n", "<br>")

    return PDFExtractResult(
        title_en=focus_en,
        title_fr=focus_fr,
        title_ge=focus_ge,
        legacy_toc=toc_html,
        error=None,
    )
def extract_focuses(contentfile):
    """Return the focus titles of an issue.

    ``contentfile`` is expected to be an instance of a `ContentFile`.

    Possible sources for the focuses, in order of preference:

    1. PDF cover page (including table of contents) -- currently DISABLED
    2. PDF metadata subject (no table of contents)

    As the cover page source is disabled, this delegates directly to the
    metadata based extraction.
    """
    result = extract_focuses_from_pdfmetadata(contentfile)
    return result
def get_wordcloud_dir():
    """Return the word cloud directory below MEDIA_ROOT, creating it if needed.

    The directory is ``<MEDIA_ROOT>/issue/wordcloud``; missing parent
    directories are created as well.
    """
    # TODO: refactor getting and setting of word clouds.
    target_dir = pathlib.Path(settings.MEDIA_ROOT).joinpath("issue", "wordcloud")
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir
def generate_wordcloud(contentfile, filename, custom_stopwords=None):
    """Render a word cloud PNG from the text of a PDF.

    The text of all pages is extracted with ``pdftotext`` and joined with
    spaces. Stop words are the union of the ``wordcloud`` defaults, the
    entries of ``stopwords_de.txt`` located next to this module, and the
    optional ``custom_stopwords``.

    Args:
        contentfile: File object of the PDF, passed to ``pdftotext.PDF``.
        filename: Base name of the image; ``.png`` is appended.
        custom_stopwords: Optional iterable of additional stop words.
            ``None`` (the default) means no additional stop words.

    Returns:
        None. The image is written to ``get_wordcloud_dir() / "<filename>.png"``.
    """
    text = " ".join(pdftotext.PDF(contentfile))

    module_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(module_dir, "stopwords_de.txt")
    # Use a context manager so the stopwords file is closed deterministically.
    with open(stopwords_path, encoding="utf-8") as stopwords_file:
        module_stopwords_de = {line.strip() for line in stopwords_file}

    # set.union() needs iterables: passing the default None through would
    # raise TypeError, so fall back to an empty tuple.
    stopwords = STOPWORDS.union(module_stopwords_de, custom_stopwords or ())

    wordcloud = WordCloud(
        width=1000,
        height=1000,
        background_color="white",
        max_words=150,
        stopwords=stopwords,
    ).generate(text)

    wordcloud_file = get_wordcloud_dir() / f"{filename}.png"
    wordcloud.to_image().save(wordcloud_file, "PNG")
def get_wordcloud_bitmap(instance):
    """Return the path of an issue's word cloud image, if the file exists.

    The image is looked up as ``<year>-0<issue_number>.png`` inside the
    directory returned by ``get_wordcloud_dir()``.

    Args:
        instance: Object providing ``year`` and ``issue_number`` attributes.

    Returns:
        The image path as a string, relative to the parent directory of
        ``settings.MEDIA_ROOT``, or ``None`` (after logging a warning) if the
        file does not exist.
    """
    filename = f"{instance.year}-0{instance.issue_number}.png"
    wordcloud_abs_path = get_wordcloud_dir() / filename

    # MEDIA_ROOT may be configured as a plain string; wrap it in a Path the
    # same way get_wordcloud_dir() does before asking for its parent.
    media_parent = pathlib.Path(settings.MEDIA_ROOT).parent
    wordcloud_rel_path = str(wordcloud_abs_path.relative_to(media_parent))

    try:
        # Opening the file serves purely as an existence check.
        with wordcloud_abs_path.open():
            pass
    except FileNotFoundError:
        logger.warning("Wordcloud file for issue %s does not exist.", instance)
        return None
    return wordcloud_rel_path