Pastebin

Hilfe zum html-Modul in Python

von Karolus
Beschreibung:
http://de.openoffice.info/viewtopic.php?f=18&t=67362

Dein Code:
  1. Type:        module
  2. String form: <module 'html' from '/usr/lib/python3.5/html/__init__.py'>
  3. File:        /usr/lib/python3.5/html/__init__.py
  4. Source:    
  5. """
  6. General functions for HTML manipulation.
  7. """
  8.  
  9. import re as _re
  10. from html.entities import html5 as _html5
  11.  
  12.  
  13. __all__ = ['escape', 'unescape']
  14.  
  15.  
  16. def escape(s, quote=True):
  17.     """
  18.    Replace special characters "&", "<" and ">" to HTML-safe sequences.
  19.    If the optional flag quote is true (the default), the quotation mark
  20.    characters, both double quote (") and single quote (') characters are also
  21.    translated.
  22.    """
  23.     s = s.replace("&", "&amp;") # Must be done first!
  24.     s = s.replace("<", "&lt;")
  25.     s = s.replace(">", "&gt;")
  26.     if quote:
  27.         s = s.replace('"', "&quot;")
  28.         s = s.replace('\'', "&#x27;")
  29.     return s
  30.  
  31.  
  32. # see http://www.w3.org/TR/html5/syntax.html#tokenizing-character-references
  33.  
  34. _invalid_charrefs = {
  35.     0x00: '\ufffd',  # REPLACEMENT CHARACTER
  36.     0x0d: '\r',      # CARRIAGE RETURN
  37.     0x80: '\u20ac',  # EURO SIGN
  38.     0x81: '\x81',    # <control>
  39.     0x82: '\u201a',  # SINGLE LOW-9 QUOTATION MARK
  40.     0x83: '\u0192',  # LATIN SMALL LETTER F WITH HOOK
  41.     0x84: '\u201e',  # DOUBLE LOW-9 QUOTATION MARK
  42.     0x85: '\u2026',  # HORIZONTAL ELLIPSIS
  43.     0x86: '\u2020',  # DAGGER
  44.     0x87: '\u2021',  # DOUBLE DAGGER
  45.     0x88: '\u02c6',  # MODIFIER LETTER CIRCUMFLEX ACCENT
  46.     0x89: '\u2030',  # PER MILLE SIGN
  47.     0x8a: '\u0160',  # LATIN CAPITAL LETTER S WITH CARON
  48.     0x8b: '\u2039',  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  49.     0x8c: '\u0152',  # LATIN CAPITAL LIGATURE OE
  50.     0x8d: '\x8d',    # <control>
  51.     0x8e: '\u017d',  # LATIN CAPITAL LETTER Z WITH CARON
  52.     0x8f: '\x8f',    # <control>
  53.     0x90: '\x90',    # <control>
  54.     0x91: '\u2018',  # LEFT SINGLE QUOTATION MARK
  55.     0x92: '\u2019',  # RIGHT SINGLE QUOTATION MARK
  56.     0x93: '\u201c',  # LEFT DOUBLE QUOTATION MARK
  57.     0x94: '\u201d',  # RIGHT DOUBLE QUOTATION MARK
  58.     0x95: '\u2022',  # BULLET
  59.     0x96: '\u2013',  # EN DASH
  60.     0x97: '\u2014',  # EM DASH
  61.     0x98: '\u02dc',  # SMALL TILDE
  62.     0x99: '\u2122',  # TRADE MARK SIGN
  63.     0x9a: '\u0161',  # LATIN SMALL LETTER S WITH CARON
  64.     0x9b: '\u203a',  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  65.     0x9c: '\u0153',  # LATIN SMALL LIGATURE OE
  66.     0x9d: '\x9d',    # <control>
  67.     0x9e: '\u017e',  # LATIN SMALL LETTER Z WITH CARON
  68.     0x9f: '\u0178',  # LATIN CAPITAL LETTER Y WITH DIAERESIS
  69. }
  70.  
  71. _invalid_codepoints = {
  72.     # 0x0001 to 0x0008
  73.     0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
  74.     # 0x000E to 0x001F
  75.     0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
  76.     0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,
  77.     # 0x007F to 0x009F
  78.     0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a,
  79.     0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
  80.     0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
  81.     # 0xFDD0 to 0xFDEF
  82.     0xfdd0, 0xfdd1, 0xfdd2, 0xfdd3, 0xfdd4, 0xfdd5, 0xfdd6, 0xfdd7, 0xfdd8,
  83.     0xfdd9, 0xfdda, 0xfddb, 0xfddc, 0xfddd, 0xfdde, 0xfddf, 0xfde0, 0xfde1,
  84.     0xfde2, 0xfde3, 0xfde4, 0xfde5, 0xfde6, 0xfde7, 0xfde8, 0xfde9, 0xfdea,
  85.     0xfdeb, 0xfdec, 0xfded, 0xfdee, 0xfdef,
  86.     # others
  87.     0xb, 0xfffe, 0xffff, 0x1fffe, 0x1ffff, 0x2fffe, 0x2ffff, 0x3fffe, 0x3ffff,
  88.     0x4fffe, 0x4ffff, 0x5fffe, 0x5ffff, 0x6fffe, 0x6ffff, 0x7fffe, 0x7ffff,
  89.     0x8fffe, 0x8ffff, 0x9fffe, 0x9ffff, 0xafffe, 0xaffff, 0xbfffe, 0xbffff,
  90.     0xcfffe, 0xcffff, 0xdfffe, 0xdffff, 0xefffe, 0xeffff, 0xffffe, 0xfffff,
  91.     0x10fffe, 0x10ffff
  92. }
  93.  
  94.  
  95. def _replace_charref(s):
  96.     s = s.group(1)
  97.     if s[0] == '#':
  98.         # numeric charref
  99.         if s[1] in 'xX':
  100.             num = int(s[2:].rstrip(';'), 16)
  101.         else:
  102.             num = int(s[1:].rstrip(';'))
  103.         if num in _invalid_charrefs:
  104.             return _invalid_charrefs[num]
  105.         if 0xD800 <= num <= 0xDFFF or num > 0x10FFFF:
  106.             return '\uFFFD'
  107.         if num in _invalid_codepoints:
  108.             return ''
  109.         return chr(num)
  110.     else:
  111.         # named charref
  112.         if s in _html5:
  113.             return _html5[s]
  114.         # find the longest matching name (as defined by the standard)
  115.         for x in range(len(s)-1, 1, -1):
  116.             if s[:x] in _html5:
  117.                 return _html5[s[:x]] + s[x:]
  118.         else:
  119.             return '&' + s
  120.  
  121.  
  122. _charref = _re.compile(r'&(#[0-9]+;?'
  123.                        r'|#[xX][0-9a-fA-F]+;?'
  124.                        r'|[^\t\n\f <&#;]{1,32};?)')
  125.  
  126. def unescape(s):
  127.     """
  128.    Convert all named and numeric character references (e.g. &gt;, &#62;,
  129.    &x3e;) in the string s to the corresponding unicode characters.
  130.    This function uses the rules defined by the HTML 5 standard
  131.    for both valid and invalid character references, and the list of
  132.    HTML 5 named character references defined in html.entities.html5.
  133.    """
  134.     if '&' not in s:
  135.         return s
  136.     return _charref.sub(_replace_charref, s)

Quellcode

Hier kannst du den Code kopieren und ihn in deinen bevorzugten Editor einfügen. Alternativ kannst du den gesamten Eintrag auch als Datei herunterladen.