"""
Python Markdown

A Python implementation of John Gruber's Markdown.

Documentation: https://python-markdown.github.io/
GitHub: https://github.com/Python-Markdown/markdown/
PyPI: https://pypi.org/project/Markdown/

Started by Manfred Stienstra (http://www.dwerg.net/).
Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
Currently maintained by Waylan Limberg (https://github.com/waylan),
Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

Copyright 2007-2018 The Python Markdown Project (v. 1.7 and later)
Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
Copyright 2004 Manfred Stienstra (the original version)

License: BSD (see LICENSE.md for details).
"""

import re
import xml.etree.ElementTree as etree
from . import util
from . import inlinepatterns


def build_treeprocessors(md, **kwargs):
    """
    Build the default `treeprocessors` for Markdown.

    Keyword arguments:

    * `md`: The object passed to the constructor of each treeprocessor.
    * `**kwargs`: Accepted for call compatibility; not used by this function.

    Returns: A `util.Registry` in which `InlineProcessor` is registered as
    `'inline'` (20), `PrettifyTreeprocessor` as `'prettify'` (10) and
    `UnescapeTreeprocessor` as `'unescape'` (0).

    """
    treeprocessors = util.Registry()
    # NOTE(review): the trailing number is presumably a priority that decides
    # run order (inline, then prettify, then unescape) -- confirm against
    # `util.Registry`.
    treeprocessors.register(InlineProcessor(md), 'inline', 20)
    treeprocessors.register(PrettifyTreeprocessor(md), 'prettify', 10)
    treeprocessors.register(UnescapeTreeprocessor(md), 'unescape', 0)
    return treeprocessors


def isString(s):
    """
    Check whether `s` is a plain string.

    Keyword arguments:

    * `s`: Any object.

    Returns: `True` when `s` is a `str` that is not a `util.AtomicString`,
    otherwise `False`.

    """
    return isinstance(s, str) and not isinstance(s, util.AtomicString)


class Treeprocessor(util.Processor):
    """
    `Treeprocessor`s are run on the `ElementTree` object before serialization.

    Each `Treeprocessor` implements a `run` method that takes a pointer to an
    `ElementTree`, modifies it as necessary and returns an `ElementTree`
    object.

    `Treeprocessors` must extend `markdown.Treeprocessor`.

    """

    def run(self, root):
        """
        Subclasses of `Treeprocessor` should implement a `run` method, which
        takes a root `ElementTree`. This method can return another `ElementTree`
        object, and the existing root `ElementTree` will be replaced, or it can
        modify the current tree and return `None`.

        This base implementation does nothing and returns `None`.
        """
        pass  # pragma: no cover


class InlineProcessor(Treeprocessor):
    """
    A `Treeprocessor` that traverses a tree, applying inline patterns.

    Nodes produced by the patterns are kept in `self.stashed_nodes` and
    represented in the text by placeholder strings; the placeholders are later
    swapped for the stashed nodes while the tree is rebuilt.
    """

    def __init__(self, md):
        """
        Store the placeholder settings from `util` and the inline patterns.

        Keyword arguments:

        * `md`: Object providing `inlinePatterns`; also stored as `self.md`.

        """
        self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
        self.__placeholder_suffix = util.ETX
        # NOTE(review): the 4 presumably matches the zero-padded id width used
        # in `__makePlaceholder` -- confirm.
        self.__placeholder_length = (
            4 + len(self.__placeholder_prefix) + len(self.__placeholder_suffix)
        )
        self.__placeholder_re = util.INLINE_PLACEHOLDER_RE
        self.md = md
        self.inlinePatterns = md.inlinePatterns
        self.ancestors = []
        # Both are reset at the start of every `run`; they are initialised here
        # so that the attributes always exist.
        self.stashed_nodes = {}
        self.parent_map = {}

    def __makePlaceholder(self, type):
        """
        Generate a placeholder.

        Keyword arguments:

        * `type`: Not used.

        Returns: The placeholder string and the id embedded in it. The id is
        the current number of stashed nodes, zero-padded to four digits.

        """
        node_id = "%04d" % len(self.stashed_nodes)
        placeholder = util.INLINE_PLACEHOLDER % node_id
        return placeholder, node_id

    def __findPlaceholder(self, data, index):
        """
        Extract id from data string, start from index.

        Keyword arguments:

        * `data`: string
        * `index`: index, from which we start search

        Returns: placeholder id and string index, after the found placeholder.
        When nothing matches, the id is `None` and the index is `index + 1`.

        """
        m = self.__placeholder_re.search(data, index)
        if m:
            return m.group(1), m.end()
        return None, index + 1

    def __stashNode(self, node, type):
        """
        Add node to stash.

        Keyword arguments:

        * `node`: The object to store in `self.stashed_nodes`.
        * `type`: Forwarded to `__makePlaceholder`.

        Returns: The placeholder string under whose id `node` was stored.

        """
        placeholder, node_id = self.__makePlaceholder(type)
        self.stashed_nodes[node_id] = node
        return placeholder

    def __handleInline(self, data, patternIndex=0):
        """
        Process string with inline patterns and replace it
        with placeholders.

        Keyword arguments:

        * `data`: A line of Markdown text
        * `patternIndex`: The index of the `inlinePattern` to start with

        Returns: String with placeholders. A `util.AtomicString` is returned
        unchanged.

        """
        if not isinstance(data, util.AtomicString):
            startIndex = 0
            count = len(self.inlinePatterns)
            while patternIndex < count:
                data, matched, startIndex = self.__applyPattern(
                    self.inlinePatterns[patternIndex], data, patternIndex, startIndex
                )
                # Stay on the same pattern for as long as it keeps matching.
                if not matched:
                    patternIndex += 1
        return data

    def __processElementText(self, node, subnode, isText=True):
        """
        Process placeholders in `Element.text` or `Element.tail`
        of Elements popped from `self.stashed_nodes`.

        Keywords arguments:

        * `node`: parent node
        * `subnode`: processing node
        * `isText`: boolean variable, True - it's text, False - it's a tail

        Returns: None

        """
        if isText:
            text = subnode.text
            subnode.text = None
        else:
            text = subnode.tail
            subnode.tail = None

        childResult = self.__processPlaceholders(text, subnode, isText)

        # Elements found in a tail go right after `subnode` inside `node`;
        # everything else goes to the front of `node`.
        if not isText and node is not subnode:
            pos = list(node).index(subnode) + 1
        else:
            pos = 0

        # Inserting in reverse at a fixed position keeps the original order.
        childResult.reverse()
        for newChild in childResult:
            node.insert(pos, newChild[0])

    def __processPlaceholders(self, data, parent, isText=True):
        """
        Process string with placeholders and generate `ElementTree` tree.

        Keyword arguments:

        * `data`: string with placeholders instead of `ElementTree` elements.
        * `parent`: Element, which contains processing inline data
        * `isText`: `True` when leading text belongs to `parent.text`, `False`
          when it belongs to `parent.tail`.

        Returns: list of `(element, ancestors)` tuples, one for each stashed
        element found in `data`; `ancestors` is a copy of `self.ancestors`.

        """
        def linkText(text):
            """ Append `text` after the last result, or to the parent. """
            if text:
                if result:
                    if result[-1][0].tail:
                        result[-1][0].tail += text
                    else:
                        result[-1][0].tail = text
                elif not isText:
                    if parent.tail:
                        parent.tail += text
                    else:
                        parent.tail = text
                else:
                    if parent.text:
                        parent.text += text
                    else:
                        parent.text = text

        result = []
        startIndex = 0
        while data:
            index = data.find(self.__placeholder_prefix, startIndex)
            if index != -1:
                node_id, phEndIndex = self.__findPlaceholder(data, index)

                if node_id in self.stashed_nodes:
                    node = self.stashed_nodes.get(node_id)

                    if index > 0:
                        text = data[startIndex:index]
                        linkText(text)

                    if not isString(node):  # it's Element
                        for child in [node] + list(node):
                            if child.tail:
                                if child.tail.strip():
                                    self.__processElementText(
                                        node, child, False
                                    )
                            if child.text:
                                if child.text.strip():
                                    self.__processElementText(child, child)
                    else:  # it's just a string
                        linkText(node)
                        startIndex = phEndIndex
                        continue

                    startIndex = phEndIndex
                    result.append((node, self.ancestors[:]))

                else:  # wrong placeholder
                    # Emit the prefix as literal text and keep scanning after it.
                    end = index + len(self.__placeholder_prefix)
                    linkText(data[startIndex:end])
                    startIndex = end
            else:
                text = data[startIndex:]
                if isinstance(data, util.AtomicString):
                    # We don't want to lose the `AtomicString`
                    text = util.AtomicString(text)
                linkText(text)
                # An empty string ends the loop.
                data = ""

        return result

    def __applyPattern(self, pattern, data, patternIndex, startIndex=0):
        """
        Check if the line fits the pattern, create the necessary
        elements, add it to `stashed_nodes`.

        Keyword arguments:

        * `data`: the text to be processed
        * `pattern`: the pattern to be checked
        * `patternIndex`: index of current pattern
        * `startIndex`: string index, from which we start searching

        Returns: A tuple of the (possibly updated) string with placeholders
        instead of `ElementTree` elements, a boolean telling whether the
        pattern matched, and the index at which the next search should start.

        """
        new_style = isinstance(pattern, inlinepatterns.InlineProcessor)

        for exclude in pattern.ANCESTOR_EXCLUDES:
            if exclude.lower() in self.ancestors:
                return data, False, 0

        if new_style:
            match = None
            # Since `handleMatch` may reject our first match,
            # we iterate over the buffer looking for matches
            # until we can't find any more.
            for match in pattern.getCompiledRegExp().finditer(data, startIndex):
                node, start, end = pattern.handleMatch(match, data)
                if start is None or end is None:
                    # NOTE(review): `startIndex` is not read again on this
                    # path; the update is kept as in the original.
                    startIndex += match.end(0)
                    match = None
                    continue
                break
        else:  # pragma: no cover
            match = pattern.getCompiledRegExp().match(data[startIndex:])
            leftData = data[:startIndex]

        if not match:
            return data, False, 0

        if not new_style:  # pragma: no cover
            node = pattern.handleMatch(match)
            start = match.start(0)
            end = match.end(0)

        if node is None:
            return data, True, end

        if not isString(node):
            if not isinstance(node.text, util.AtomicString):
                # We need to process current node too
                for child in [node] + list(node):
                    if child.text:
                        self.ancestors.append(child.tag.lower())
                        child.text = self.__handleInline(
                            child.text, patternIndex + 1
                        )
                        self.ancestors.pop()
                    if child.tail:
                        child.tail = self.__handleInline(
                            child.tail, patternIndex
                        )

        placeholder = self.__stashNode(node, pattern.type())

        if new_style:
            return "{}{}{}".format(data[:start],
                                   placeholder, data[end:]), True, 0
        else:  # pragma: no cover
            return "{}{}{}{}".format(leftData,
                                     match.group(1),
                                     placeholder, match.groups()[-1]), True, 0

    def __build_ancestors(self, parent, parents):
        """
        Build the ancestor list.

        Keyword arguments:

        * `parent`: Element to start from (may be `None`).
        * `parents`: List that is extended in place.

        Walks `self.parent_map` upwards from `parent` and appends the
        lower-cased tag names to `parents`, outermost ancestor first and
        `parent` itself last.

        """
        ancestors = []
        while parent is not None:
            ancestors.append(parent.tag.lower())
            parent = self.parent_map.get(parent)
        ancestors.reverse()
        parents.extend(ancestors)

    def run(self, tree, ancestors=None):
        """Apply inline patterns to a parsed Markdown tree.

        Iterate over `ElementTree`, find elements with inline tag, apply inline
        patterns and append newly created Elements to tree. If you don't
        want to process your data with inline patterns, instead of normal
        string, use subclass `AtomicString`:

            node.text = markdown.AtomicString("This will not be processed.")

        Arguments:

        * `tree`: `ElementTree` object, representing Markdown tree.
        * `ancestors`: List of parent tag names that precede the tree node (if needed).

        Returns: `ElementTree` object with applied inline patterns.

        """
        self.stashed_nodes = {}

        # Ensure a valid parent list, but copy passed in lists
        # to ensure we don't have the user accidentally change it on us.
        tree_parents = [] if ancestors is None else ancestors[:]

        # Map every child element in the tree to its parent.
        self.parent_map = {c: p for p in tree.iter() for c in p}
        stack = [(tree, tree_parents)]

        while stack:
            currElement, parents = stack.pop()

            self.ancestors = parents
            self.__build_ancestors(currElement, self.ancestors)

            insertQueue = []
            for child in currElement:
                if child.text and not isinstance(
                    child.text, util.AtomicString
                ):
                    self.ancestors.append(child.tag.lower())
                    text = child.text
                    child.text = None
                    lst = self.__processPlaceholders(
                        self.__handleInline(text), child
                    )
                    for item in lst:
                        self.parent_map[item[0]] = child
                    stack += lst
                    insertQueue.append((child, lst))
                    self.ancestors.pop()
                if child.tail:
                    tail = self.__handleInline(child.tail)
                    # Throw-away element that collects the leading tail text.
                    dumby = etree.Element('d')
                    child.tail = None
                    tailResult = self.__processPlaceholders(tail, dumby, False)
                    if dumby.tail:
                        child.tail = dumby.tail
                    pos = list(currElement).index(child) + 1
                    tailResult.reverse()
                    for newChild in tailResult:
                        self.parent_map[newChild[0]] = currElement
                        currElement.insert(pos, newChild[0])
                if len(child):
                    self.parent_map[child] = currElement
                    stack.append((child, self.ancestors[:]))

            # Elements created from a child's text are inserted only after the
            # loop over `currElement` has finished.
            for element, lst in insertQueue:
                for i, obj in enumerate(lst):
                    newChild = obj[0]
                    element.insert(i, newChild)
        return tree


class PrettifyTreeprocessor(Treeprocessor):
    """ Add line breaks to the html document. """

    def _prettifyETree(self, elem):
        """
        Recursively add line breaks to `ElementTree` children.

        Keyword arguments:

        * `elem`: Element to modify in place.

        Block-level elements other than `code` and `pre` get a newline as
        their text when that text is empty or whitespace and their first child
        is block-level; block-level children are then processed the same way.
        Any element whose tail is empty or whitespace gets a newline tail.

        """
        i = "\n"
        if self.md.is_block_level(elem.tag) and elem.tag not in ['code', 'pre']:
            if (not elem.text or not elem.text.strip()) \
                    and len(elem) and self.md.is_block_level(elem[0].tag):
                elem.text = i
            for e in elem:
                if self.md.is_block_level(e.tag):
                    self._prettifyETree(e)
        if not elem.tail or not elem.tail.strip():
            elem.tail = i

    def run(self, root):
        """
        Add line breaks to `ElementTree` root object.

        Keyword arguments:

        * `root`: Root element; modified in place.

        Returns: None

        """
        self._prettifyETree(root)
        # Do `<br />`'s separately as they are often in the middle of
        # inline content and missed by `_prettifyETree`.
        brs = root.iter('br')
        for br in brs:
            if not br.tail or not br.tail.strip():
                br.tail = '\n'
            else:
                br.tail = '\n%s' % br.tail
        # Clean up extra empty lines at end of code blocks.
        pres = root.iter('pre')
        for pre in pres:
            if len(pre) and pre[0].tag == 'code':
                code = pre[0]
                # Only prettify code containing text only
                if not len(code) and code.text is not None:
                    code.text = util.AtomicString(code.text.rstrip() + '\n')


class UnescapeTreeprocessor(Treeprocessor):
    """ Restore escaped chars """

    # Matches a run of digits enclosed by `util.STX` and `util.ETX`; the
    # digits are captured as group 1.
    RE = re.compile(r'{}(\d+){}'.format(util.STX, util.ETX))

    def _unescape(self, m):
        """ Return the character whose code point is the number in group 1. """
        return chr(int(m.group(1)))

    def unescape(self, text):
        """
        Replace every match of `RE` in `text` with the character it encodes.

        Keyword arguments:

        * `text`: string to process.

        Returns: The string with all matches substituted.

        """
        return self.RE.sub(self._unescape, text)

    def run(self, root):
        """
        Loop over all elements and unescape all text.

        Keyword arguments:

        * `root`: Root element; modified in place.

        Returns: None

        """
        for elem in root.iter():
            # Unescape text content (the text of `code` elements is left as is)
            if elem.text and not elem.tag == 'code':
                elem.text = self.unescape(elem.text)
            # Unescape tail content
            if elem.tail:
                elem.tail = self.unescape(elem.tail)
            # Unescape attribute values
            for key, value in elem.items():
                elem.set(key, self.unescape(value))