# SPDX-FileCopyrightText: 2024 Thomas Breitner <t.breitner@csl.mpg.de>
#
# SPDX-License-Identifier: EUPL-1.2
import os
import re
import tempfile
from collections import namedtuple
from urllib.parse import urlparse
import pypandoc
from bs4 import BeautifulSoup
from pybtex.database import BibliographyData, Entry
from django.conf import settings
from django.utils.text import Truncator
from django.utils.html import strip_tags, escape
[docs]
def word2html(cleaned_data, page_id=None):
    """Convert an uploaded word-processor document to HTML5 via pandoc.

    The upload is copied into a temporary file and converted with
    ``pypandoc.convert_file``; pandoc is asked to extract embedded media
    below ``MEDIA_ROOT/article_img/<slug>``.  The generated markup is then
    post-processed: absolute filesystem image paths are rewritten to
    ``MEDIA_URL`` links and every ``<table>`` gets Bootstrap classes plus a
    responsive wrapper.  Finally a 100-word plain-text excerpt is derived
    from the HTML.

    Args:
        cleaned_data: Mapping that provides ``"word_file"`` (an object with
            a ``name`` attribute and a ``read()`` method) and ``"slug"``.
        page_id: Accepted but not used by this function.

    Returns:
        An ``Article`` namedtuple with the fields ``article_filename``
        (upload name without extension), ``article_file_extension``
        (extension without the dot), ``as_html5`` (post-processed markup)
        and ``auto_excerpt`` (escaped, tag-free text truncated to 100 words).
    """
    print("[ * ] Process word2html...")
    print("[ * ] cwd: ", os.getcwd())
    print("[ * ] instance: ", cleaned_data["word_file"])
    print("[ * ] type instance: ", type(cleaned_data["word_file"]))

    article_img_base = str(os.path.join(settings.MEDIA_ROOT, "article_img"))
    article_img_dir = "{}/{}".format(article_img_base, cleaned_data["slug"])

    article_file = cleaned_data["word_file"]
    article_filename, _article_file_extension = os.path.splitext(article_file.name)
    article_file_extension = _article_file_extension.replace(".", "")

    # Notes on options that are intentionally not passed:
    #   --smart / --normalize were removed in pandoc 2 (smart is now an
    #   extension, normalize is the default).
    #   --standalone / --toc / --template are not needed as long as the
    #   table of contents is not generated by pandoc.
    pdoc_args = [
        f"--extract-media={article_img_dir}",
        "--mathml",
    ]

    with tempfile.NamedTemporaryFile() as f:
        f.write(article_file.read())
        # pandoc opens the file by name, so the buffered bytes must be
        # flushed to disk before the conversion starts.
        f.flush()
        f.seek(0)
        print("[ f ]: ", f)
        print("[ f ] name: ", f.name)
        print("[ f ] type: ", type(f))
        docx_to_html5 = pypandoc.convert_file(
            f.name,
            format=article_file_extension,  # docx, odt
            to="html5",
            extra_args=pdoc_args,
        )

    as_html5 = _rewrite_media_img_src(
        docx_to_html5, settings.MEDIA_ROOT, settings.MEDIA_URL
    )
    as_html5 = _wrap_tables_responsive(as_html5)

    # Convert html to plain text (the pandoc plain output format uses
    # uppercase for titles etc., hence tags are stripped here instead).
    _text = escape(strip_tags(as_html5))
    # Store a truncated version of the text.
    auto_excerpt = "{}".format(Truncator(_text).words(100))

    Article = namedtuple(
        "Article",
        [
            "article_filename",
            "article_file_extension",
            "as_html5",
            "auto_excerpt",
        ],
    )
    return Article(article_filename, article_file_extension, as_html5, auto_excerpt)


def _rewrite_media_img_src(html, media_root, media_url):
    """Replace filesystem ``media_root`` prefixes in ``<img src>`` by ``media_url``.

    Corrects relative image links, i.e. turns

        <img src="/home/user/project/media/article_img/...
        <img\\nsrc="/home/user/project/media/article_img/..."

    into ``<img src="/media/article_img/...`` (for ``media_url == "/media/"``).
    """
    # A root configured with a trailing slash would otherwise produce "//"
    # in the pattern and never match the paths pandoc wrote.
    root = str(media_root).rstrip("/")
    # re.escape keeps characters such as "." or "+" in the path literal.
    pattern = rf'<img(\\n|\s+)src="(\\n|\s*){re.escape(root)}/'
    replacement = f'<img src="{str(media_url)}'
    # A callable replacement is inserted verbatim; a plain string would have
    # its backslashes interpreted as regex template escapes.
    return re.sub(pattern, lambda _match: replacement, html)


def _wrap_tables_responsive(html):
    """Add Bootstrap table classes and a ``table-responsive`` wrapper to tables."""
    soup = BeautifulSoup(html, "html.parser")
    for table in soup.select("table"):
        # Add Bootstrap table class
        table["class"] = table.get("class", []) + ["table table-sm"]
        # Add Bootstrap responsive table wrapper
        wrapper = soup.new_tag("div", **{"class": "table-responsive"})
        table.insert_before(wrapper)
        wrapper.append(table)
    # Convert the modified HTML back to a string
    return str(soup)
[docs]
def export_bibtex(self):
    """Render a BibTeX ``@article`` record for ``self`` without touching disk.

    The record is keyed by ``self.slug`` and always carries author, title,
    journal (fixed to ``"eucrim"``) and year; ``doi`` and ``url`` are added
    only when ``self.doi_id`` is truthy.  The result is meant to be served
    directly (see the RoutablePageMixin note below) instead of being stored
    as a file.

    Returns:
        A ``Bibtex`` namedtuple ``(data, filename)`` where ``data`` is the
        BibTeX source as a string and ``filename`` is ``"<slug>.bib"``.
    """
    # Served via RoutablePageMixin without a persistent file object:
    # http://docs.wagtail.io/en/v2.1/reference/contrib/routablepage.html
    print("[ * ] Generating the bibtex file...")

    # Multiple return values as namedtuple, see:
    # https://dbader.org/blog/writing-clean-python-with-namedtuples
    Bibtex = namedtuple("Bibtex", "data filename")

    filename = f"{self.slug}.bib"

    # BibTeX separates several authors with the literal word "and".
    author_field = " and ".join(
        "{} {}".format(person.first_name, person.last_name)
        for person in self.authors
    )

    doi_url = "" if self.get_doi is None else self.get_doi
    # The DOI itself is the URL path without its leading slash.
    doi = urlparse(doi_url).path[1:]

    # todo: fix year; can't access first_published_at for new instances here
    try:
        year = str(self.publication_date.year)
    except AttributeError:
        # No usable publication date: emit an empty year field.
        year = ""

    fields = [
        ("author", author_field),
        ("title", self.title),
        ("journal", "eucrim"),
        ("year", year),
    ]
    if self.doi_id:
        fields += [("doi", doi), ("url", doi_url)]

    bibliography = BibliographyData({self.slug: Entry("article", fields)})
    rendered = bibliography.to_string(bib_format="bibtex")

    return Bibtex(rendered, filename)