Source code for eucrim.issue.utils
# SPDX-FileCopyrightText: 2024 Thomas Breitner <t.breitner@csl.mpg.de>
#
# SPDX-License-Identifier: EUPL-1.2
import logging
import pathlib
import re
import os
from collections import namedtuple
import pdftotext
from pypdf import PdfReader
from wordcloud import WordCloud, STOPWORDS
from django.conf import settings
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
[docs]
def canonical_issue_filename(issue_year, issue_number, filetype=None):
    """Build the canonical name for an issue.

    Without a ``filetype`` the result is the bare stem
    ``"<issue_year>-0<issue_number>"``. With a (truthy) ``filetype`` the stem
    is wrapped as ``"eucrim_issue_<stem>.<filetype>"``.

    Note that the ``0`` in front of the issue number is a literal character,
    not zero-padding: issue number 12 yields ``"...-012"``.
    """
    stem = f"{issue_year}-0{issue_number}"
    if not filetype:
        # No file type: neither the prefix nor the extension is added.
        return stem
    return f"eucrim_issue_{stem}.{filetype}"
# Result record returned by the PDF focus extractors in this module:
# the three focus titles (English, French, German), the legacy table of
# contents, and an error message.
PDFExtractResult = namedtuple(
    "PDFExtractResult",
    ("title_en", "title_fr", "title_ge", "legacy_toc", "error"),
)
[docs]
def extract_focuses_from_pdfmetadata(contentfile):
    """Read the three focus titles from the PDF's metadata subject.

    The subject is split on ``";"`` and must produce exactly three parts,
    which are taken as the English, French and German title in that order.
    Only the subject entry of the metadata is consulted.

    Returns a ``PDFExtractResult`` whose ``legacy_toc`` is always ``None``.
    On success ``error`` is ``None``; when the subject cannot be accessed or
    does not split into exactly three parts, all titles are ``None`` and
    ``error`` carries a descriptive message.
    """
    metadata = PdfReader(contentfile).metadata

    try:
        english, french, german = metadata.subject.split(";")
    except AttributeError as exc:
        # Attribute access failed on the metadata or on its subject.
        problem = f"No PDF meta subject found (should be: title_en;title_fr;title_ge)! Error message was: {exc}"
    except ValueError as exc:
        # The split did not yield exactly three parts to unpack.
        problem = f"Not all issue titles (EN, FR, GE) found or unable to parse PDF meta subject (should be: title_en;title_fr;title_ge)! Error message was: {exc}"
    else:
        problem = None

    if problem:
        english = french = german = None

    return PDFExtractResult(
        title_en=english,
        title_fr=french,
        title_ge=german,
        legacy_toc=None,
        error=problem,
    )
[docs]
def extract_focuses_from_pdfcoverpage(contentfile):
    """Extract focus titles and a legacy table of contents from the cover page.

    Only the first page of the PDF text is inspected. Each title is the whole
    line that starts with the language-specific keyword ("Focus",
    "Dossier particulier", "Schwerpunktthema"); a title that is not found is
    returned as an empty string. The legacy TOC is everything after the
    "Schwerpunktthema" line, with newlines replaced by ``<br>``; it is an
    empty string when that line is missing.

    Returns a ``PDFExtractResult`` with ``error`` set to ``None``.
    """
    # Only the first page (the cover) is processed.
    first_page = pdftotext.PDF(contentfile)[0]

    # ToDo: refactor this
    # Result field -> pattern matching the full line of that language's title.
    title_patterns = {
        "title_en": r"^Focus.*$",
        "title_fr": r"^Dossier particulier.*$",
        "title_ge": r"^Schwerpunktthema.*$",
    }
    titles = {}
    for field, pattern in title_patterns.items():
        hit = re.search(pattern, first_page, re.MULTILINE)
        titles[field] = "" if hit is None else hit.group()

    # The TOC is whatever follows the German title line on the cover page.
    toc_hit = re.search(
        r"^Schwerpunktthema.*?$(.*)", first_page, flags=re.MULTILINE | re.DOTALL
    )
    if toc_hit is None:
        toc_html = ""
    else:
        toc_html = toc_hit.group(1).replace("\n", "<br>")

    return PDFExtractResult(legacy_toc=toc_html, error=None, **titles)
[docs]
def extract_focuses(contentfile):
    """
    Return the focus titles of an issue.

    ``contentfile`` is expected to be an instance of a `ContentFile`.

    Sources for focuses (in that ordering):
        1. pdf cover page (including toc) -- DISABLED
        2. pdf metadata subject (no toc)
    """
    # Disabled step: try extract_focuses_from_pdfcoverpage(contentfile) first
    # and fall back to the metadata only if any of its fields is falsy.
    # Until that is re-enabled the metadata subject is the only source.
    focuses = extract_focuses_from_pdfmetadata(contentfile)
    return focuses
[docs]
def get_wordcloud_dir():
    """Return ``<MEDIA_ROOT>/issue/wordcloud`` as a path, creating it if missing.

    Missing parent directories are created as well; an already existing
    directory is not an error.
    """
    # Todo: refactor get and set wordcloud
    target_dir = pathlib.Path(settings.MEDIA_ROOT, "issue", "wordcloud")
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir
[docs]
def generate_wordcloud(contentfile, filename, custom_stopwords=None):
    """Render a word cloud PNG from the text of a PDF.

    Args:
        contentfile: PDF source handed to ``pdftotext.PDF``; the text of all
            pages is joined with spaces and fed to the word cloud.
        filename: Base name of the output image. ``".png"`` is appended and
            the file is written into the directory returned by
            ``get_wordcloud_dir()``.
        custom_stopwords: Optional iterable of additional words to exclude.
            ``None`` (the default) adds no extra stopwords.

    Returns:
        None. The image is saved to disk as a side effect.
    """
    pages = pdftotext.PDF(contentfile)
    full_text = " ".join(pages)

    # German stopwords ship next to this module, one word per line.
    module_dir = os.path.dirname(__file__)
    stopwords_path = os.path.join(module_dir, "stopwords_de.txt")
    # Use a context manager so the file handle is closed deterministically.
    with open(stopwords_path, encoding="utf-8") as stopwords_file:
        module_stopwords_de = {line.strip() for line in stopwords_file}

    # ``set.union`` requires iterables; passing the ``None`` default straight
    # through would raise ``TypeError``, so substitute an empty tuple.
    extra_stopwords = () if custom_stopwords is None else custom_stopwords
    stopwords = STOPWORDS.union(module_stopwords_de, extra_stopwords)

    wordcloud = WordCloud(
        width=1000,
        height=1000,
        background_color="white",
        max_words=150,
        stopwords=stopwords,
    ).generate(full_text)

    wordcloud_file = get_wordcloud_dir() / "{}.png".format(filename)
    bitmap = wordcloud.to_image()
    bitmap.save(wordcloud_file, "PNG")
[docs]
def get_wordcloud_bitmap(instance):
    """Return the path of an issue's word cloud image, if the file exists.

    Args:
        instance: Object providing ``year`` and ``issue_number`` attributes;
            the image is looked up as ``"<year>-0<issue_number>.png"`` inside
            the directory returned by ``get_wordcloud_dir()`` (which creates
            that directory if it is missing).

    Returns:
        The image path as a string, relative to the parent directory of
        ``settings.MEDIA_ROOT``, or ``None`` (after logging a warning) when
        the file does not exist.
    """
    filename = "{}-0{}.png".format(instance.year, instance.issue_number)
    wordcloud_abs_path = get_wordcloud_dir() / filename

    # MEDIA_ROOT may be configured as a plain string; wrap it in a Path the
    # same way get_wordcloud_dir() does before asking for its parent.
    media_parent = pathlib.Path(settings.MEDIA_ROOT).parent
    wordcloud_rel_path = str(wordcloud_abs_path.relative_to(media_parent))

    try:
        # Opening the file is the existence check; the handle is closed
        # again immediately by the context manager.
        with wordcloud_abs_path.open():
            return wordcloud_rel_path
    except FileNotFoundError:
        logger.warning("Wordcloud file for issue {} does not exist.".format(instance))
        return None