mirror of
https://github.com/lleene/hugo-site.git
synced 2025-01-23 03:52:21 +01:00
450 lines
15 KiB
Python
450 lines
15 KiB
Python
#!/usr/bin/env python3
|
||
|
||
"""Command line utility for converting a tex source file into a markdown document."""
|
||
|
||
import sys
|
||
import re
|
||
from os import path
|
||
from pathlib import Path
|
||
from typing import List, Tuple, NamedTuple
|
||
|
||
|
||
class Section(NamedTuple):
    """Structured heading item (chapter, section or sub-section).

    ``span`` holds the ``(start, end)`` offsets of the heading in the source
    text, ``index`` its running number, ``level`` the Markdown heading depth,
    ``name`` the title text and ``label`` the LaTeX label (empty if none).
    """

    span: Tuple[int, int]
    index: int
    level: int
    name: str
    label: str

    @property
    def markdown(self) -> str:
        """Return the heading as a numbered Markdown title plus a blank line."""
        hashes = "#" * self.level
        return f"{hashes} {self.index} {self.name}\n\n"
|
||
|
||
|
||
class Figure(NamedTuple):
    """Structured figure item.

    ``span`` holds the ``(start, end)`` offsets of the figure in the source
    text, ``index`` its running number, ``files`` the image paths it includes,
    ``caption`` the caption text and ``label`` the LaTeX label (empty if none).
    """

    span: Tuple[int, int]
    index: int
    files: List[str]
    caption: str
    label: str

    @property
    def markdown(self) -> str:
        """Return one ``figure`` shortcode per image file.

        Only the last shortcode carries the ``title`` attribute with the
        figure number and caption; a figure without files still emits that
        titled shortcode, with an empty ``src``.
        """
        shortcodes = [
            "{{" + f'< figure src="{image}" width="500" >' + "}}\n"
            for image in self.files[:-1]
        ]
        last_image = self.files[-1] if self.files else ""
        shortcodes.append(
            "{{"
            + f'< figure src="{last_image}" title="Figure {self.index}: {self.caption}" width="500" >'
            + "}}\n"
        )
        return "".join(shortcodes)
|
||
|
||
|
||
class Equation(NamedTuple):
    """Structured equation item.

    ``span`` holds the ``(start, end)`` offsets of the equation in the source
    text, ``index`` its running number, ``expression`` the math source and
    ``label`` the LaTeX label (empty if none).
    """

    span: Tuple[int, int]
    index: int
    expression: str
    label: str

    @property
    def markdown(self) -> str:
        """Return the expression wrapped in ``$$`` display-math delimiters."""
        return "$$ {} $$".format(self.expression)
|
||
|
||
|
||
class Table(NamedTuple):
    """Structured table item.

    ``span`` holds the ``(start, end)`` offsets of the table in the source
    text, ``index`` its running number, ``caption`` the caption text,
    ``content`` the rows as lists of cell strings, ``footer`` trailing text
    appended after the rows and ``label`` the LaTeX label (empty if none).
    """

    span: Tuple[int, int]
    index: int
    caption: str
    content: List[List[str]]
    footer: str
    label: str

    @property
    def markdown(self) -> str:
        """Return the table as a caption line followed by pipe-delimited rows.

        The first row is treated as the header and is followed by a ``----``
        delimiter row with one entry per header cell. Double backslashes
        (LaTeX row terminators) are dropped from every row.
        """
        pieces = [f"Table {self.index}: {self.caption}\n"]
        for row_number, cells in enumerate(self.content):
            pieces.append("|" + "|".join(cells).replace("\\\\", "") + "|" + "\n")
            if row_number == 0:
                pieces.append("|" + "|".join("----" for _ in cells) + "|" + "\n")
        pieces.append(self.footer)
        return "".join(pieces)
|
||
|
||
|
||
class Citation(NamedTuple):
    """Structured bibliography entry.

    ``index`` is the running number of the entry, ``name`` its cleaned-up
    reference text and ``label`` the citation key it is referred to by.
    """

    index: int
    name: str
    label: str
|
||
|
||
|
||
class LatexFile:
    """LaTeX document that is flattened and rewritten as Markdown text.

    Constructing an instance reads the source file, inlines every ``\\input``
    it references and runs :meth:`filter_tex`; the converted text is then
    available in ``tex_src``.
    """

    # Labels never contain braces, so a brace-free class keeps the match from
    # running past the closing brace when more groups follow on the same line.
    _LABEL = r"\\label\{([^{}]*)\}"
    # Greedy on purpose so captions with nested groups are captured in full.
    _CAPTION = r"\\caption\{(.*)\}"
    _INPUT = r"\\input\{([0-z\/\.\_\-]*)\}"
    _BIBITEM = r"\\bibitem\{([0-z.\-_]*)\}"
    # Group 1 collects *all* "sub" prefixes; a repeated capturing group such
    # as "(sub)*" would only keep the last repetition.
    _HEADING = r"\\((?:sub)*)(section|chapter)\*?(.*)"

    def __init__(self, src_file: Path):
        """Load ``src_file`` and convert it, using the ``.bbl`` file next to it."""
        sys_path = path.abspath(src_file)
        src_dir = path.dirname(sys_path)
        src_name = path.basename(sys_path)
        self.tex_src = self.flatten_input("\\input{" + src_name + "}", src_dir)
        # Swap only the extension; str.replace would also rewrite ".tex"
        # occurring in a directory name.
        self.filter_tex(path.splitext(sys_path)[0] + ".bbl")

    @classmethod
    def first(cls, list: List[str]) -> str:
        """Fetch the first optional element else return empty string."""
        # The parameter name shadows the builtin; kept for compatibility.
        return list[0] if list else ""

    @classmethod
    def _caption(cls, body: str) -> str:
        """Return the first caption in ``body`` (empty string if none).

        Labels are removed first so that a ``\\label`` following or nested in
        the caption does not end up inside the greedy caption match.
        """
        return cls.first(re.findall(cls._CAPTION, re.sub(cls._LABEL, "", body)))

    def _environments(self, name: str):
        """Pair the n-th ``\\begin{name}`` with the n-th ``\\end{name}``.

        ``name`` is inserted into the pattern as a regular-expression fragment.
        """
        return zip(
            re.finditer(r"\\begin\{" + name + r"\}", self.tex_src),
            re.finditer(r"\\end\{" + name + r"\}", self.tex_src),
        )

    @property
    def figures(self) -> "List[Figure]":
        """Parse the TEX contents for figure environments."""
        fig_list = []
        for index, (begin, stop) in enumerate(self._environments(r"figure\*?")):
            body = self.tex_src[begin.start() : stop.end()]
            fig_list.append(
                Figure(
                    span=(begin.start(), stop.end()),
                    index=index + 1,
                    files=[
                        elem[1]
                        for elem in re.findall(
                            r"\\includegraphics(.*)\{(.*)\}", body
                        )
                    ],
                    caption=self._caption(body),
                    label=self.first(re.findall(self._LABEL, body)),
                )
            )
        return fig_list

    @property
    def sections(self) -> "List[Section]":
        """Parse the TEX contents for chapter and (sub)section headings.

        Each heading spans from its command to the end of that line. The
        level is one plus the number of ``sub`` prefixes, and headings are
        numbered consecutively in document order regardless of level.
        """
        sec_list = []
        for index, match in enumerate(re.finditer(self._HEADING, self.tex_src)):
            remainder = match.group(3)
            label = self.first(re.findall(self._LABEL, remainder))
            title = remainder.replace("\\label{" + label + "}", "").strip()
            sec_list.append(
                Section(
                    span=(match.start(), match.end()),
                    index=index + 1,
                    level=len(match.group(1)) // 3 + 1,
                    name=title[1:-1],  # drop the surrounding braces
                    label=label,
                )
            )
        return sec_list

    @property
    def equations(self) -> "List[Equation]":
        """Parse the TEX contents for equation environments."""
        eq_list = []
        for index, (begin, stop) in enumerate(self._environments("equation")):
            body = self.tex_src[begin.end() : stop.start()]
            label = self.first(re.findall(self._LABEL, body))
            eq_list.append(
                Equation(
                    span=(begin.start(), stop.end()),
                    index=index + 1,
                    expression=body.replace(
                        "\\label{" + label + "}", ""
                    ).replace("\n", ""),
                    label=label,
                )
            )
        return eq_list

    @property
    def tables(self) -> "List[Table]":
        """Parse the TEX contents for table environments.

        Rows are the lines between ``\\begin{tabular}`` and ``\\end{tabular}``
        (skipping the rest of the ``\\begin{tabular}`` line) that contain at
        least one ``&``. Text after the tabular becomes the footer. A table
        without a tabular yields no rows and an empty footer.
        """
        tbl_list = []
        for index, (begin, stop) in enumerate(self._environments(r"table\*?")):
            body = self.tex_src[begin.end() : stop.start()]
            label = self.first(re.findall(self._LABEL, body))
            caption = self._caption(body)
            tab_begin = re.search(r"\\begin\{tabular\}", body)
            tab_end = re.search(r"\\end\{tabular\}", body)
            if tab_begin and tab_end:
                # Label and caption are reported separately; keep them out
                # of the footer when they follow the tabular.
                footer = (
                    body[tab_end.end() :]
                    .replace("\n", "")
                    .replace("\\label{" + label + "}", "")
                    .replace("\\caption{" + caption + "}", "")
                )
                rows = body[tab_begin.end() : tab_end.start()].split("\n")[1:]
            else:
                footer = ""
                rows = []
            content = [
                cells for cells in (row.split("&") for row in rows) if len(cells) > 1
            ]
            tbl_list.append(
                Table(
                    span=(begin.start(), stop.end()),
                    index=index + 1,
                    caption=caption,
                    content=content,
                    footer=footer,
                    label=label,
                )
            )
        return tbl_list

    def _replace_items(self, items) -> None:
        """Swap each item's span for its markdown, then resolve its ``\\ref``."""
        ordered = list(reversed(items))
        # Work back to front so earlier spans stay valid while text changes.
        for item in ordered:
            start, stop = item.span
            self.tex_src = self.tex_src[:start] + item.markdown + self.tex_src[stop:]
        for item in ordered:
            # Exact string replacement: labels may contain regex
            # metacharacters such as "." or "+".
            if item.label:
                self.tex_src = self.tex_src.replace(
                    "\\ref{" + item.label + "}", str(item.index)
                )

    def replace_figures(self) -> None:
        """Dereference and replace all figures with markdown formatting."""
        self._replace_items(self.figures)

    def replace_tables(self) -> None:
        """Dereference and replace all tables with markdown formatting."""
        self._replace_items(self.tables)

    def replace_equations(self) -> None:
        """Dereference and replace all equations with markdown formatting."""
        self._replace_items(self.equations)

    @classmethod
    def parse_bbl(cls, lines: List[str]) -> "List[Citation]":
        """Parse BBL contents for bibtex references.

        Entries are the blank-line separated paragraphs between the first
        and the last one; paragraphs without a ``\\bibitem`` are skipped.
        Entries are numbered from one in file order.
        """
        citations = []
        for entry in "".join(lines).split("\n\n")[1:-1]:
            key = re.search(cls._BIBITEM, entry)
            if key is None:
                continue
            citations.append(
                Citation(
                    label=key.group(1),
                    index=len(citations) + 1,
                    # The bibitem alternative uses the same character class
                    # as the key pattern so keys with "." or "-" are removed
                    # in full.
                    name=re.sub(
                        r"\\emph|\\BIBentry[A-z]*wordspacing"
                        r"|\\bibitem\{[0-z.\-_]*\}|[{}\n~]|\\url",
                        "",
                        entry,
                    ),
                )
            )
        return citations

    @classmethod
    def md_bbl(cls, src_bbl: "List[Citation]") -> str:
        """Render the citations as a Markdown footnote list ordered by index.

        The given list is not modified.
        """
        entries = [
            f"[^{citation.index}]: {citation.name}\n"
            for citation in sorted(src_bbl, key=lambda item: item.index)
        ]
        return "# References:\n\n" + "".join(entries)

    def replace_sections(self) -> None:
        """Dereference and replace all sections with markdown formatting."""
        self._replace_items(self.sections)

    def preprocess(self) -> None:
        """Prep procedure for customized formatting."""
        custom_rules = [
            (r"\n(\w)*%.*", ""),
            (r"}(\n)*\\label", r"}\\label"),
            (r" %.*", ""),
            (r"\\usection", r"\\section"),
            (r"\\usubsection", r"\\subsection"),
            (r"\\begin{abstract}", r"\\section{Abstract}"),
            (r"\\end{abstract}", r""),
            (r"\\maketitle", r""),
            (r"\\IEEEpeerreviewmaketitle", r""),
            (r"\\bstctlcite{[0-z:]*}", r""),
            (r"\\clearpage", r""),
            (r"\\pagebreak", r""),
            (r"\\\\[ ]*", r""),
            (r"\$\\,\$", r" "),
            (r"\$([0-z\\\.\-\+_,{}%\(\)\/]*)\$", r"\\\\(\g<1>\\\\)"),
            (r"\\flushleft", r""),
            (r"\\begin{flushleft}", r""),
            (r"\\end{flushleft}", r""),
            (r"\\begin{center}", r""),
            (r"\\end{center}", r""),
        ]
        # Rules are applied in order; later rules see earlier results.
        for rule, result in custom_rules:
            self.tex_src = re.sub(rule, result, self.tex_src)

    def postprocess(self) -> None:
        """Clean up procedure for customized formatting."""
        custom_rules = [
            (r"\\,", r" "),
            (r"~", r" "),
            (r"\\tsqrd", r"²"),
            (r"\\rpi", r"π"),
            (r"\\rmu", r"μ"),
            (r"\\rbeta", r"β"),
            (r"\\ralpha", r"α"),
            (r"\\rDelta", r"Δ"),
            (r"\\rdelta", r"δ"),
            (r"\\rsigma", r"σ"),
            (r"\\rSigma", r"Σ"),
            (r"\\rtau", r"τ"),
            (r"\\reta", r"η"),
            (r"\\rphi", r"φ"),
            (r"\\rPhi", r"Φ"),
            (r"\\romeg", r"ω"),
            (r"\\rOmeg", r"Ω"),
            (r"\\vspace\{[0-z \.\-\+]*\}", r""),
            (r"\\hspace\{[0-z \.\-\+]*\}", r""),
            (r"\\vfill", r""),
            (r"\\hfill", r""),
            (r" \\& ", r" & "),
            (r"\$\\pm\$", "±"),
            # Closing tags need the slash; "<sub>x<sub>" leaves both open.
            (r"\\tss\{([0-z,_\. ]*)\}", r"<sub>\g<1></sub>"),
            (r"\\tps\{([0-z,_\. ]*)\}", r"<sup>\g<1></sup>"),
            (r"\\textbf\{([0-z\.,_ ]*)\}", r"**\g<1>**"),
            (r"\\sqrt", r"√"),
            (r"\\%", "%"),
        ]
        # Rules are applied in order; later rules see earlier results.
        for rule, result in custom_rules:
            self.tex_src = re.sub(rule, result, self.tex_src)

    @classmethod
    def flatten_input(cls, tex_src: str, basedir: Path = Path(".")) -> str:
        """Recursive method for generating a flattened latex source.

        Every ``\\input{file}`` in ``tex_src`` is replaced by the contents of
        ``file`` (resolved against ``basedir``), whose own inputs are expanded
        the same way. If ``file`` does not exist but ``file.tex`` does, the
        latter is read, matching the extension-less form of ``\\input``.

        Raises:
            OSError: if a referenced file cannot be opened.
        """
        for source_file in re.findall(cls._INPUT, tex_src):
            file_name = path.join(basedir, source_file)
            if not path.isfile(file_name) and path.isfile(file_name + ".tex"):
                file_name += ".tex"
            with open(file_name, "r", encoding="utf8") as handle:
                flattened = "".join(
                    cls.flatten_input(line, basedir=basedir)
                    if re.search(cls._INPUT, line)
                    else line
                    for line in handle
                )
            tex_src = tex_src.replace("\\input{" + source_file + "}", flattened)
        return tex_src

    def replace_references(self, bbl_file: Path) -> None:
        """Replace citations with footnote markers and insert the reference list.

        Each ``\\cite{a,b}`` becomes ``[^i][^j]`` in the order the keys are
        cited; keys missing from ``bbl_file`` are left in a ``\\cite``. The
        ``\\bibliographystyle`` command is removed and ``\\bibliography`` is
        replaced by the output of :meth:`md_bbl`.

        Raises:
            OSError: if ``bbl_file`` cannot be opened.
        """
        with open(bbl_file, "r", encoding="utf8") as file:
            bbl_list = self.parse_bbl(file.readlines())
        index_of = {bbl.label: bbl.index for bbl in bbl_list}

        def footnotes(match) -> str:
            # Compare whole keys: a substring match would let key "bc"
            # consume part of "\cite{abc}".
            keys = [key.strip() for key in match.group(1).split(",")]
            markers = "".join(
                f"[^{index_of[key]}]" for key in keys if key in index_of
            )
            unknown = [key for key in keys if key and key not in index_of]
            if unknown:
                return markers + "\\cite{" + ",".join(unknown) + "}"
            return markers

        self.tex_src = re.sub(r"\\cite\{([^{}]*)\}", footnotes, self.tex_src)
        self.tex_src = re.sub(
            r"\\bibliographystyle\{[0-z,\-_\/\.]*\}", "", self.tex_src
        )
        bibliography = self.md_bbl(bbl_list)
        # A callable replacement is inserted verbatim; a template string
        # would have its backslashes (e.g. "\hskip") read as escapes.
        self.tex_src = re.sub(
            r"\\bibliography\{[0-z,\-_\.\/]*\}",
            lambda _match: bibliography,
            self.tex_src,
        )

    def strip_tex(self) -> None:
        """Clear default TEX preamble - postamble.

        Keeps the text between ``\\begin{document}`` and ``\\end{document}``;
        a missing marker leaves that end of the text untouched.
        """
        begin = re.search(r"\\begin\{document\}", self.tex_src)
        end = re.search(r"\\end\{document\}", self.tex_src)
        start = begin.end() if begin else 0
        stop = end.start() if end else len(self.tex_src)
        self.tex_src = self.tex_src[start:stop]

    def filter_tex(self, bbl_file: Path) -> None:
        """Default TEX filtering procedure."""
        self.strip_tex()
        self.preprocess()
        self.replace_references(bbl_file)
        self.replace_figures()
        self.replace_tables()
        self.replace_equations()
        self.replace_sections()
        self.postprocess()
|
||
|
||
|
||
if __name__ == "__main__":
    # Convert every file named on the command line and print the result;
    # the guard keeps an import of this module free of side effects.
    for file in sys.argv[1:]:
        print(LatexFile(file).tex_src)
|
||
|
||
# eof
|