# Source code for eucrim.article.utils

# SPDX-FileCopyrightText: 2024 Thomas Breitner <t.breitner@csl.mpg.de>
#
# SPDX-License-Identifier: EUPL-1.2

import os
import re
import tempfile
from collections import namedtuple
from urllib.parse import urlparse

import pypandoc
from bs4 import BeautifulSoup
from pybtex.database import BibliographyData, Entry

from django.conf import settings
from django.utils.text import Truncator
from django.utils.html import strip_tags, escape



# [docs]
def word2html(cleaned_data, page_id=None):
    """Convert an uploaded word-processor file to HTML5 using pandoc.

    The upload is copied into a temporary file and converted with pandoc;
    embedded media is extracted below ``MEDIA_ROOT/article_img/<slug>``.
    The resulting HTML is post-processed: image sources that point into
    ``MEDIA_ROOT`` on the file system are rewritten to start with
    ``MEDIA_URL``, and every ``<table>`` receives the Bootstrap classes
    ``table table-sm`` and is wrapped in a ``<div class="table-responsive">``.

    Args:
        cleaned_data: Mapping providing ``"word_file"`` (a file-like object
            exposing ``name`` and ``read()``) and ``"slug"``.
        page_id: Not used by this function; kept so existing callers that
            pass it keep working.

    Returns:
        An ``Article`` namedtuple with the fields ``article_filename`` (the
        upload name without extension), ``article_file_extension`` (without
        the leading dot; also used as pandoc's input format), ``as_html5``
        (the post-processed HTML) and ``auto_excerpt`` (tag-stripped,
        HTML-escaped text truncated to 100 words).
    """
    print("[ * ] Process word2html...")
    print("[ * ] cwd: ", os.getcwd())
    print("[ * ] instance: ", cleaned_data["word_file"])
    print("[ * ] type instance: ", type(cleaned_data["word_file"]))

    article_img_dir = "{}/{}".format(
        os.path.join(settings.MEDIA_ROOT, "article_img"),
        cleaned_data["slug"],
    )
    article_file = cleaned_data["word_file"]

    article_filename, _article_file_extension = os.path.splitext(article_file.name)
    # splitext() yields e.g. ".docx"; pandoc expects the bare format name.
    article_file_extension = _article_file_extension.lstrip(".")

    pdoc_args = [
        # '--smart' and '--normalize' were removed in pandoc 2.
        # '--standalone', '--toc' and '--template' are not used, as the toc
        # is not generated via pandoc in 'standalone' mode.
        f"--extract-media={article_img_dir}",
        "--mathml",
    ]

    with tempfile.NamedTemporaryFile() as f:
        f.write(article_file.read())
        # pandoc opens the file by name, so the buffered content has to be
        # on disk before the conversion starts.
        f.flush()

        print("[ f ]: ", f)
        print("[ f ] name: ", f.name)
        print("[ f ] type: ", type(f))

        docx_to_html5 = pypandoc.convert_file(
            f.name,
            format=article_file_extension,  # docx, odt
            to="html5",
            extra_args=pdoc_args,
        )

    # Correct relative image links
    # from
    #    <img src="/home/breitner/projects/eucrim/media...
    # or
    #    <img\nsrc="
    #      /home/.../media/article_img/<slug>/media/image1.jpg"\nstyle="..." />
    # to   <img src="/media...
    #
    # The media root is a file-system path, not a regex: escape it so that
    # characters such as "." or "+" are matched literally.
    media_root_pattern = re.escape(str(settings.MEDIA_ROOT))
    re_to_search = r'<img(\\n|\s+)src="(\\n|\s*)' + media_root_pattern + "/"
    str_to_replace = f'<img src="{str(settings.MEDIA_URL)}'

    # A callable replacement is inserted verbatim; a plain string would be
    # treated as a template in which backslashes have special meaning.
    as_html5 = re.sub(re_to_search, lambda _match: str_to_replace, docx_to_html5)

    soup = BeautifulSoup(as_html5, "html.parser")
    for table in soup.select("table"):
        # Add the Bootstrap table classes (serialized as "table table-sm").
        table["class"] = table.get("class", []) + ["table", "table-sm"]

        # Put the table inside a Bootstrap responsive wrapper <div>.
        table.wrap(soup.new_tag("div", attrs={"class": "table-responsive"}))

    # Convert the modified HTML back to a string
    as_html5 = str(soup)

    # Convert html to plain text (the pandoc plain output format uses
    # uppercase for titles etc., so the tags are stripped here instead).
    _text = escape(strip_tags(as_html5))

    # Store a truncated version of the text
    auto_excerpt = str(Truncator(_text).words(100))

    Article = namedtuple(
        "Article",
        [
            "article_filename",
            "article_file_extension",
            "as_html5",
            "auto_excerpt",
        ],
    )

    return Article(article_filename, article_file_extension, as_html5, auto_excerpt)




# [docs]
def export_bibtex(self):
    """Render a BibTeX ``@article`` entry for this object.

    No file is written; the serialized entry is returned together with a
    suggested file name (now delivered via RoutablePageMixin without a
    persistent file object, see
    http://docs.wagtail.io/en/v2.1/reference/contrib/routablepage.html).

    Returns:
        A ``Bibtex`` namedtuple ``(data, filename)`` where ``data`` is the
        entry as a BibTeX-formatted string and ``filename`` is
        ``"<slug>.bib"``.
    """
    print("[ * ] Generating the bibtex file...")

    filename = f"{self.slug}.bib"

    # BibTeX separates multiple authors with the word "and".
    author_field = " and ".join(
        f"{person.first_name} {person.last_name}" for person in self.authors
    )

    # The DOI itself is the path of the DOI URL without the leading slash.
    doi_url = self.get_doi
    if doi_url is None:
        doi_url = ""
    doi = urlparse(doi_url).path[1:]

    # todo: fix year; can't access first_published_at for new
    # instances here
    try:
        year = str(self.publication_date.year)
    except AttributeError:
        year = ""

    fields = [
        ("author", author_field),
        ("title", self.title),
        ("journal", "eucrim"),
        ("year", year),
    ]
    if self.doi_id:
        fields += [("doi", doi), ("url", doi_url)]

    bibliography = BibliographyData({self.slug: Entry("article", fields)})
    serialized = bibliography.to_string(bib_format="bibtex")

    # Multiple return values as namedtuple:
    # see: https://dbader.org/blog/writing-clean-python-with-namedtuples
    Bibtex = namedtuple("Bibtex", ["data", "filename"])
    return Bibtex(data=serialized, filename=filename)