hugo-site/scripts/tex_deref.py
2023-04-23 13:14:23 +02:00

450 lines
15 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Command line utility for converting a tex source file into a markdown document."""
import sys
import re
from os import path
from pathlib import Path
from typing import List, Tuple, NamedTuple
class Section(NamedTuple):
    """Structured Section Item.

    One sectioning command found in the TeX source, together with the
    data needed to replace it by a markdown heading.
    """

    # (start, end) character offsets of the command in the TeX source.
    span: Tuple[int, int]
    # 1-based running number, printed in front of the heading text.
    index: int
    # Heading depth; becomes the number of leading "#" characters.
    level: int
    # Heading text.
    name: str
    # Argument of the attached \label command, "" when there is none.
    label: str

    @property
    def markdown(self) -> str:
        """Markdown heading for this section, followed by a blank line."""
        return f"{'#' * self.level} {self.index} {self.name}\n\n"
class Figure(NamedTuple):
    """Structured Figure Item.

    One figure environment found in the TeX source, rendered as one
    ``figure`` shortcode per included graphics file.
    """

    # (start, end) character offsets of the environment in the TeX source.
    span: Tuple[int, int]
    # 1-based figure number used in the rendered title.
    index: int
    # Graphics files of the figure, in source order.
    files: List[str]
    # Caption text, rendered in the title of the last shortcode.
    caption: str
    # Argument of the attached \label command, "" when there is none.
    label: str

    @property
    def markdown(self) -> str:
        """Markdown string for this figure.

        Every file but the last gets a bare shortcode; the last one also
        carries the numbered caption as its title. A figure without any
        file still yields a single shortcode with an empty ``src``.
        """
        shortcodes = [
            "{{" + f'< figure src="{file}" width="500" >' + "}}\n"
            for file in self.files[:-1]
        ]
        last_file = self.files[-1] if self.files else ""
        shortcodes.append(
            "{{"
            + f'< figure src="{last_file}" title="Figure {self.index}: {self.caption}" width="500" >'
            + "}}\n"
        )
        return "".join(shortcodes)
class Equation(NamedTuple):
    """Structured Equation Item.

    One equation environment found in the TeX source.
    """

    # (start, end) character offsets of the environment in the TeX source.
    span: Tuple[int, int]
    # 1-based equation number.
    index: int
    # Body of the environment, emitted verbatim between "$$" delimiters.
    expression: str
    # Argument of the attached \label command, "" when there is none.
    label: str

    @property
    def markdown(self) -> str:
        """Markdown string for this equation (display math)."""
        return "$$ " + self.expression + " $$"
class Table(NamedTuple):
    """Structured Table Item.

    One table environment found in the TeX source, rendered as a
    pipe-delimited markdown table.
    """

    # (start, end) character offsets of the environment in the TeX source.
    span: Tuple[int, int]
    # 1-based table number used in the caption line.
    index: int
    # Caption text, printed above the table.
    caption: str
    # Table rows, each a list of cell strings; the first row is the header.
    content: List[List[str]]
    # Text appended verbatim after the last row.
    footer: str
    # Argument of the attached \label command, "" when there is none.
    label: str

    @property
    def markdown(self) -> str:
        """Markdown string for this table.

        The caption line comes first, then one line per row with TeX row
        terminators (a double backslash) removed. A separator line is
        inserted after the first row so that it is rendered as the header.
        """
        parts = [f"Table {self.index}: {self.caption}\n"]
        for row_number, cells in enumerate(self.content):
            parts.append("|" + "|".join(cells).replace("\\\\", "") + "|\n")
            if row_number == 0:
                parts.append("|" + "|".join(["----"] * len(cells)) + "|\n")
        parts.append(self.footer)
        return "".join(parts)
class Citation(NamedTuple):
    """Structured Citation Item.

    One bibliography entry parsed from a BBL file.
    """

    # 1-based number of the entry; used as the markdown footnote id.
    index: int
    # Entry text with the TeX markup removed.
    name: str
    # Citation key, i.e. the argument of the entry's \bibitem command.
    label: str
class LatexFile:
    r"""LaTeX document that is flattened and rewritten as Hugo markdown.

    Constructing an instance reads the source file, inlines every
    ``\input`` it contains and runs the whole conversion pipeline (see
    ``filter_tex``). The converted text is available as ``tex_src``.
    """

    # A TeX comment (left untouched) or an ``\input{file}`` command.
    _INPUT_OR_COMMENT = re.compile(r"(?<!\\)%.*|\\input\{([^{}\s]+)\}")
    # Sectioning command up to the end of its line. Group 1 holds ALL the
    # "sub" prefixes; a repeated group ``(sub)*`` would keep only the last.
    _SECTION = re.compile(r"\\((?:sub)*)(section|chapter)(?![A-Za-z])(.*)")
    _LABEL = re.compile(r"\\label\{([^{}]*)\}")
    _GRAPHICS = re.compile(
        r"\\includegraphics\*?\s*(?:\[[^\]]*\])?\s*\{([^{}]*)\}"
    )
    _CITE = re.compile(r"\\cite\{([^{}]*)\}")
    # ``\bibitem{key}`` with an optional ``[...]`` argument in between.
    _BIBITEM = re.compile(r"\\bibitem(?:\[[^\]]*\])?\{([^{}]*)\}")
    # Markup removed from a bibliography entry to obtain plain text.
    _BBL_NOISE = re.compile(
        r"\\emph|\\BIBentry[A-z]*wordspacing|[\{\}\n~]|\\url"
    )

    def __init__(self, src_file: Path):
        """Load *src_file*, flatten its inputs and convert it to markdown.

        The bibliography is read from the file that has the same name as
        *src_file* but the extension ``.bbl``.

        Raises:
            OSError: if the source, an input or the BBL file cannot be read.
        """
        sys_path = path.abspath(src_file)
        with open(sys_path, "r", encoding="utf8") as handle:
            self.tex_src = self.flatten_input(
                handle.read(), path.dirname(sys_path)
            )
        # Swap the extension only; a plain str.replace would also rewrite
        # a ".tex" that occurs earlier in the path.
        self.filter_tex(path.splitext(sys_path)[0] + ".bbl")

    @classmethod
    def first(cls, list: List[str]) -> str:
        """Fetch the first optional element else return empty string."""
        # The parameter name shadows the builtin; it is kept unchanged so
        # that existing keyword callers continue to work.
        return list[0] if list else ""

    @staticmethod
    def _braced_argument(command: str, text: str) -> str:
        r"""Return the first brace-balanced argument of a TeX command.

        *command* is a regular expression fragment for the command name
        without the backslash. A ``*`` and an optional ``[...]`` argument
        after the name are skipped. Returns "" when the command is not
        found or its braces are not balanced within *text*.
        """
        opening = re.search(
            r"\\" + command + r"\*?\s*(?:\[[^\]]*\])?\s*\{", text
        )
        if opening is None:
            return ""
        depth = 1
        pos = opening.end()
        while pos < len(text):
            char = text[pos]
            if char == "\\":
                pos += 2  # skip escaped characters such as \{ and \}
                continue
            if char == "{":
                depth += 1
            elif char == "}":
                depth -= 1
                if depth == 0:
                    return text[opening.end() : pos]
            pos += 1
        return ""

    @classmethod
    def _title_argument(cls, command: str, text: str) -> str:
        """Braced argument of *command* without labels or line breaks."""
        title = cls._LABEL.sub("", cls._braced_argument(command, text))
        return " ".join(title.split())

    def _environments(self, name: str) -> "List[Tuple[re.Match, re.Match]]":
        r"""Pair the n-th ``\begin{name}`` with the n-th ``\end{name}``.

        *name* is a regular expression fragment. Nested environments of
        the same name are not supported by this positional pairing.
        """
        return list(
            zip(
                re.finditer(r"\\begin\{" + name + r"\}", self.tex_src),
                re.finditer(r"\\end\{" + name + r"\}", self.tex_src),
            )
        )

    @property
    def figures(self) -> "List[Figure]":
        """Parse TEX contents for figure environments."""
        fig_list = []
        for index, (begin, stop) in enumerate(
            self._environments(r"figure\*?")
        ):
            block = self.tex_src[begin.start() : stop.end()]
            fig_list.append(
                Figure(
                    span=(begin.start(), stop.end()),
                    index=index + 1,
                    files=self._GRAPHICS.findall(block),
                    caption=self._title_argument("caption", block),
                    label=self.first(self._LABEL.findall(block)),
                )
            )
        return fig_list

    @property
    def sections(self) -> "List[Section]":
        """Parse TEX contents for sectioning commands.

        Sections are numbered with one running counter, regardless of
        their level. The span of a section covers the rest of its line.
        """
        sec_list = []
        for index, match in enumerate(self._SECTION.finditer(self.tex_src)):
            heading = match.group(0)
            sec_list.append(
                Section(
                    span=match.span(),
                    index=index + 1,
                    # one level per "sub" prefix
                    level=len(match.group(1)) // 3 + 1,
                    name=self._title_argument(
                        r"(?:sub)*(?:section|chapter)", heading
                    ),
                    label=self.first(self._LABEL.findall(heading)),
                )
            )
        return sec_list

    @property
    def equations(self) -> "List[Equation]":
        """Parse TEX contents for equation environments."""
        eq_list = []
        for index, (begin, stop) in enumerate(self._environments("equation")):
            body = self.tex_src[begin.end() : stop.start()]
            label = self.first(self._LABEL.findall(body))
            eq_list.append(
                Equation(
                    span=(begin.start(), stop.end()),
                    index=index + 1,
                    expression=body.replace(
                        "\\label{" + label + "}", ""
                    ).replace("\n", ""),
                    label=label,
                )
            )
        return eq_list

    @property
    def tables(self) -> "List[Table]":
        """Parse TEX contents for table environments.

        The rows are taken from the tabular environment inside the table,
        skipping the line that opens it (it holds the column
        specification). Lines without a cell separator are dropped.
        Everything after the tabular becomes the footer. A table without
        a tabular environment yields no rows and an empty footer.
        """
        tbl_list = []
        for index, (begin, stop) in enumerate(self._environments(r"table\*?")):
            block = self.tex_src[begin.end() : stop.start()]
            opening = re.search(r"\\begin\{tabular[x*]?\}", block)
            closing = re.search(r"\\end\{tabular[x*]?\}", block)
            if opening and closing:
                rows = block[opening.end() : closing.start()].split("\n")[1:]
                footer = block[closing.end() :].replace("\n", "")
            else:
                rows, footer = [], ""
            cells_per_row = (row.split("&") for row in rows)
            tbl_list.append(
                Table(
                    span=(begin.start(), stop.end()),
                    index=index + 1,
                    caption=self._title_argument("caption", block),
                    content=[
                        cells for cells in cells_per_row if len(cells) > 1
                    ],
                    footer=footer,
                    label=self.first(self._LABEL.findall(block)),
                )
            )
        return tbl_list

    def _replace_items(self, items: list) -> None:
        r"""Swap every item for its markdown and resolve ``\ref`` to it.

        *items* must be in source order and carry ``span``, ``markdown``,
        ``label`` and ``index``.
        """
        # Back to front, so that the remaining spans stay valid.
        for item in reversed(items):
            start, stop = item.span
            self.tex_src = (
                self.tex_src[:start] + item.markdown + self.tex_src[stop:]
            )
        for item in items:
            if item.label:
                # Literal replacement: labels may hold regex metacharacters.
                self.tex_src = self.tex_src.replace(
                    "\\ref{" + item.label + "}", str(item.index)
                )

    def replace_figures(self) -> None:
        """Dereference and replace all figures with markdown formatting."""
        self._replace_items(self.figures)

    def replace_tables(self) -> None:
        """Dereference and replace all tables with markdown formatting."""
        self._replace_items(self.tables)

    def replace_equations(self) -> None:
        """Dereference and replace all equations with markdown formatting."""
        self._replace_items(self.equations)

    @classmethod
    def parse_bbl(cls, lines: List[str]) -> "List[Citation]":
        r"""Parse BBL contents for bibtex references.

        Entries are the blocks separated by blank lines, without the first
        and the last block. Blocks that do not start with ``\bibitem`` are
        ignored; the remaining entries are numbered from 1 in file order.
        """
        citations = []
        for entry in "".join(lines).split("\n\n")[1:-1]:
            entry = entry.lstrip()
            key = cls._BIBITEM.match(entry)
            if key is None:
                continue
            text = cls._BIBITEM.sub("", entry, count=1)
            citations.append(
                Citation(
                    label=key.group(1),
                    index=len(citations) + 1,
                    name=cls._BBL_NOISE.sub("", text),
                )
            )
        return citations

    @classmethod
    def md_bbl(cls, src_bbl: "List[Citation]") -> str:
        """Render the citations as a markdown footnote list.

        The entries are listed by ascending index; *src_bbl* itself is
        left untouched.
        """
        footnotes = [
            f"[^{citation.index}]: {citation.name}\n"
            for citation in sorted(src_bbl, key=lambda x: x.index)
        ]
        return "# References:\n\n" + "".join(footnotes)

    def replace_sections(self) -> None:
        """Dereference and replace all sections with markdown formatting."""
        self._replace_items(self.sections)

    def preprocess(self) -> None:
        """Prep procedure for customized formatting.

        Applies the substitution rules below in order, before any of the
        structured items is parsed.
        """
        custom_rules = [
            (r"\n(\w)*%.*", ""),
            (r"}(\n)*\\label", r"}\\label"),
            (r" %.*", ""),
            (r"\\usection", r"\\section"),
            (r"\\usubsection", r"\\subsection"),
            (r"\\begin{abstract}", r"\\section{Abstract}"),
            (r"\\end{abstract}", r""),
            (r"\\maketitle", r""),
            (r"\\IEEEpeerreviewmaketitle", r""),
            (r"\\bstctlcite{[0-z:]*}", r""),
            (r"\\clearpage", r""),
            (r"\\pagebreak", r""),
            (r"\\\\[ ]*", r""),
            (r"\$\\,\$", r" "),
            (r"\$([0-z\\\.\-\+_,{}%\(\)\/]*)\$", r"\\\\(\g<1>\\\\)"),
            (r"\\flushleft", r""),
            (r"\\begin{flushleft}", r""),
            (r"\\end{flushleft}", r""),
            (r"\\begin{center}", r""),
            (r"\\end{center}", r""),
        ]
        for rule, result in custom_rules:
            self.tex_src = re.sub(rule, result, self.tex_src)

    def postprocess(self) -> None:
        """Clean up procedure for customized formatting.

        Applies the substitution rules below in order, after all
        structured items have been replaced.
        """
        custom_rules = [
            (r"\\,", r" "),
            (r"~", r" "),
            (r"\\tsqrd", r"²"),
            (r"\\rpi", r"π"),
            (r"\\rmu", r"μ"),
            (r"\\rbeta", r"β"),
            (r"\\ralpha", r"α"),
            (r"\\rDelta", r"Δ"),
            (r"\\rdelta", r"δ"),
            (r"\\rsigma", r"σ"),
            (r"\\rSigma", r"Σ"),
            (r"\\rtau", r"τ"),
            (r"\\reta", r"η"),
            (r"\\rphi", r"φ"),
            (r"\\rPhi", r"Φ"),
            (r"\\romeg", r"ω"),
            (r"\\rOmeg", r"Ω"),
            (r"\\vspace\{[0-z \.\-\+]*\}", r""),
            (r"\\hspace\{[0-z \.\-\+]*\}", r""),
            (r"\\vfill", r""),
            (r"\\hfill", r""),
            (r" \\& ", r" & "),
            (r"\$\\pm\$", "±"),
            # The closing tags need the slash, else the element stays open.
            (r"\\tss\{([0-z,_\. ]*)\}", r"<sub>\g<1></sub>"),
            (r"\\tps\{([0-z,_\. ]*)\}", r"<sup>\g<1></sup>"),
            (r"\\textbf\{([0-z\.,_ ]*)\}", r"**\g<1>**"),
            (r"\\sqrt", r""),
            (r"\\%", "%"),
        ]
        for rule, result in custom_rules:
            self.tex_src = re.sub(rule, result, self.tex_src)

    @classmethod
    def flatten_input(cls, tex_src: str, basedir: Path = Path(".")) -> str:
        r"""Recursive method for generating a flattened latex source.

        Every ``\input{file}`` outside of a comment is replaced by the
        flattened contents of that file. File names are resolved against
        *basedir*; ``.tex`` is appended when the name as written does not
        exist but the name with that extension does.

        Raises:
            OSError: if an input file cannot be read.
        """

        def expand(match: "re.Match") -> str:
            name = match.group(1)
            if name is None:
                # A comment matched: commented-out inputs are not expanded.
                return match.group(0)
            candidate = path.join(basedir, name)
            if not path.isfile(candidate) and path.isfile(candidate + ".tex"):
                candidate += ".tex"
            with open(candidate, "r", encoding="utf8") as handle:
                return cls.flatten_input(handle.read(), basedir=basedir)

        return cls._INPUT_OR_COMMENT.sub(expand, tex_src)

    def replace_references(self, bbl_file: Path) -> None:
        r"""Turn citations into footnote marks and emit the reference list.

        Each key of a ``\cite{...}`` that is found in *bbl_file* becomes a
        ``[^index]`` mark (marks of one command are ordered by descending
        key); unknown keys stay in a ``\cite`` command. The
        ``\bibliography`` command is replaced by the footnote list and
        ``\bibliographystyle`` is removed.

        Raises:
            OSError: if *bbl_file* cannot be read.
        """
        with open(bbl_file, "r", encoding="utf8") as file:
            bbl_list = self.parse_bbl(file.readlines())
        index_of = {bbl.label: bbl.index for bbl in bbl_list}

        def footnote_marks(match: "re.Match") -> str:
            # Whole keys are compared, so one key being a substring of
            # another cannot produce a wrong mark.
            keys = [key.strip() for key in match.group(1).split(",")]
            known = sorted(
                {key for key in keys if key in index_of}, reverse=True
            )
            unknown = [key for key in keys if key and key not in index_of]
            marks = "".join(f"[^{index_of[key]}]" for key in known)
            if unknown:
                marks += "\\cite{" + ",".join(unknown) + "}"
            return marks

        self.tex_src = self._CITE.sub(footnote_marks, self.tex_src)
        self.tex_src = re.sub(
            r"\\bibliographystyle\{[0-z,\-_\/\.]*\}", "", self.tex_src
        )
        references = self.md_bbl(bbl_list)
        # A callable keeps backslashes in the entries from being read as
        # escapes of a replacement template.
        self.tex_src = re.sub(
            r"\\bibliography\{[0-z,\-_\.\/]*\}",
            lambda _match: references,
            self.tex_src,
        )

    def strip_tex(self) -> None:
        r"""Clear default TEX preamble - postamble.

        Keeps the text between ``\begin{document}`` and
        ``\end{document}``; a missing marker leaves that side untouched.
        """
        begin = re.search(r"\\begin\{document\}", self.tex_src)
        end = re.search(r"\\end\{document\}", self.tex_src)
        start = begin.end() if begin else 0
        stop = end.start() if end else len(self.tex_src)
        self.tex_src = self.tex_src[start:stop]

    def filter_tex(self, bbl_file: Path) -> None:
        """Default TEX filtering procedure.

        Runs all conversion steps on ``tex_src``. Each step works on the
        output of the previous one, so the order matters.
        """
        self.strip_tex()
        self.preprocess()
        self.replace_references(bbl_file)
        self.replace_figures()
        self.replace_tables()
        self.replace_equations()
        self.replace_sections()
        self.postprocess()
# Convert every file named on the command line and print the result.
# The guard keeps the conversion from running when the module is imported.
if __name__ == "__main__":
    for file in sys.argv[1:]:
        print(LatexFile(file).tex_src)
# eof