package MKDoc::XML::Decode::XHTML;
use warnings;
use strict;

# Maps XHTML named character entities (the name without the leading '&' and
# trailing ';') to the character they stand for, via a static lookup table.

# Portions (c) International Organization for Standardization 1986:
# Permission to copy in any form is granted for use with conforming SGML
# systems and applications as defined in ISO 8879, provided this notice is
# included in all copies.

# Entity name => single character built with chr(CODE POINT).
our %ENTITY_2_CHAR = (

    # Latin1 characters
    'nbsp'     => chr(160),
    'iexcl'    => chr(161),
    'cent'     => chr(162),
    'pound'    => chr(163),
    'curren'   => chr(164),
    'yen'      => chr(165),
    'brvbar'   => chr(166),
    'sect'     => chr(167),
    'uml'      => chr(168),
    'copy'     => chr(169),
    'ordf'     => chr(170),
    'laquo'    => chr(171),
    'not'      => chr(172),
    'shy'      => chr(173),
    'reg'      => chr(174),
    'macr'     => chr(175),
    'deg'      => chr(176),
    'plusmn'   => chr(177),
    'sup2'     => chr(178),
    'sup3'     => chr(179),
    'acute'    => chr(180),
    'micro'    => chr(181),
    'para'     => chr(182),
    'middot'   => chr(183),
    'cedil'    => chr(184),
    'sup1'     => chr(185),
    'ordm'     => chr(186),
    'raquo'    => chr(187),
    'frac14'   => chr(188),
    'frac12'   => chr(189),
    'frac34'   => chr(190),
    'iquest'   => chr(191),
    'Agrave'   => chr(192),
    'Aacute'   => chr(193),
    'Acirc'    => chr(194),
    'Atilde'   => chr(195),
    'Auml'     => chr(196),
    'Aring'    => chr(197),
    'AElig'    => chr(198),
    'Ccedil'   => chr(199),
    'Egrave'   => chr(200),
    'Eacute'   => chr(201),
    'Ecirc'    => chr(202),
    'Euml'     => chr(203),
    'Igrave'   => chr(204),
    'Iacute'   => chr(205),
    'Icirc'    => chr(206),
    'Iuml'     => chr(207),
    'ETH'      => chr(208),
    'Ntilde'   => chr(209),
    'Ograve'   => chr(210),
    'Oacute'   => chr(211),
    'Ocirc'    => chr(212),
    'Otilde'   => chr(213),
    'Ouml'     => chr(214),
    'times'    => chr(215),
    'Oslash'   => chr(216),
    'Ugrave'   => chr(217),
    'Uacute'   => chr(218),
    'Ucirc'    => chr(219),
    'Uuml'     => chr(220),
    'Yacute'   => chr(221),
    'THORN'    => chr(222),
    'szlig'    => chr(223),
    'agrave'   => chr(224),
    'aacute'   => chr(225),
    'acirc'    => chr(226),
    'atilde'   => chr(227),
    'auml'     => chr(228),
    'aring'    => chr(229),
    'aelig'    => chr(230),
    'ccedil'   => chr(231),
    'egrave'   => chr(232),
    'eacute'   => chr(233),
    'ecirc'    => chr(234),
    'euml'     => chr(235),
    'igrave'   => chr(236),
    'iacute'   => chr(237),
    'icirc'    => chr(238),
    'iuml'     => chr(239),
    'eth'      => chr(240),
    'ntilde'   => chr(241),
    'ograve'   => chr(242),
    'oacute'   => chr(243),
    'ocirc'    => chr(244),
    'otilde'   => chr(245),
    'ouml'     => chr(246),
    'divide'   => chr(247),
    'oslash'   => chr(248),
    'ugrave'   => chr(249),
    'uacute'   => chr(250),
    'ucirc'    => chr(251),
    'uuml'     => chr(252),
    'yacute'   => chr(253),
    'thorn'    => chr(254),
    'yuml'     => chr(255),

    # C0 Controls and Basic Latin
    # NOTE(review): these five entries are deliberately left disabled in this
    # table; presumably they are decoded elsewhere -- confirm before enabling.
    # 'quot'   => chr(34),
    # 'amp'    => chr(38),
    # 'apos'   => chr(39),
    # 'lt'     => chr(60),
    # 'gt'     => chr(62),

    # Latin Extended-A
    'OElig'    => chr(338),
    'oelig'    => chr(339),
    'Scaron'   => chr(352),
    'scaron'   => chr(353),
    'Yuml'     => chr(376),

    # Spacing Modifier Letters
    'circ'     => chr(710),
    'tilde'    => chr(732),

    # General Punctuation
    # * lsaquo is proposed but not yet ISO standardized
    # * rsaquo is proposed but not yet ISO standardized
    'ensp'     => chr(8194),
    'emsp'     => chr(8195),
    'thinsp'   => chr(8201),
    'zwnj'     => chr(8204),
    'zwj'      => chr(8205),
    'lrm'      => chr(8206),
    'rlm'      => chr(8207),
    'ndash'    => chr(8211),
    'mdash'    => chr(8212),
    'lsquo'    => chr(8216),
    'rsquo'    => chr(8217),
    'sbquo'    => chr(8218),
    'ldquo'    => chr(8220),
    'rdquo'    => chr(8221),
    'bdquo'    => chr(8222),
    'dagger'   => chr(8224),
    'Dagger'   => chr(8225),
    'permil'   => chr(8240),
    'lsaquo'   => chr(8249),
    'rsaquo'   => chr(8250),
    'euro'     => chr(8364),

    # Mathematical, Greek and Symbolic characters for HTML

    # Latin Extended-B
    'fnof'     => chr(402),

    # Greek
    # * there is no Sigmaf, and no U+03A2 character either
    'Alpha'    => chr(913),
    'Beta'     => chr(914),
    'Gamma'    => chr(915),
    'Delta'    => chr(916),
    'Epsilon'  => chr(917),
    'Zeta'     => chr(918),
    'Eta'      => chr(919),
    'Theta'    => chr(920),
    'Iota'     => chr(921),
    'Kappa'    => chr(922),
    'Lambda'   => chr(923),
    'Mu'       => chr(924),
    'Nu'       => chr(925),
    'Xi'       => chr(926),
    'Omicron'  => chr(927),
    'Pi'       => chr(928),
    'Rho'      => chr(929),
    'Sigma'    => chr(931),
    'Tau'      => chr(932),
    'Upsilon'  => chr(933),
    'Phi'      => chr(934),
    'Chi'      => chr(935),
    'Psi'      => chr(936),
    'Omega'    => chr(937),
    'alpha'    => chr(945),
    'beta'     => chr(946),
    'gamma'    => chr(947),
    'delta'    => chr(948),
    'epsilon'  => chr(949),
    'zeta'     => chr(950),
    'eta'      => chr(951),
    'theta'    => chr(952),
    'iota'     => chr(953),
    'kappa'    => chr(954),
    'lambda'   => chr(955),
    'mu'       => chr(956),
    'nu'       => chr(957),
    'xi'       => chr(958),
    'omicron'  => chr(959),
    'pi'       => chr(960),
    'rho'      => chr(961),
    'sigmaf'   => chr(962),
    'sigma'    => chr(963),
    'tau'      => chr(964),
    'upsilon'  => chr(965),
    'phi'      => chr(966),
    'chi'      => chr(967),
    'psi'      => chr(968),
    'omega'    => chr(969),
    'thetasym' => chr(977),
    'upsih'    => chr(978),
    'piv'      => chr(982),

    # General Punctuation
    # * bullet is NOT the same as bullet operator, U+2219
    'bull'     => chr(8226),
    'hellip'   => chr(8230),
    'prime'    => chr(8242),
    'Prime'    => chr(8243),
    'oline'    => chr(8254),
    'frasl'    => chr(8260),

    # Letterlike Symbols
    # * alef symbol is NOT the same as hebrew letter alef, U+05D0 although the same glyph could be used to depict both characters
    'weierp'   => chr(8472),
    'image'    => chr(8465),
    'real'     => chr(8476),
    'trade'    => chr(8482),
    'alefsym'  => chr(8501),

    # Arrows
    # * Unicode does not say that lArr is the same as the 'is implied by' arrow but also
    #   does not have any other character for that function. So ? lArr can be used for 'is implied by' as ISOtech suggests
    # * Unicode does not say rArr is the 'implies' character but does not have another
    #   character with this function so ? rArr can be used for 'implies' as ISOtech suggests
    'larr'     => chr(8592),
    'uarr'     => chr(8593),
    'rarr'     => chr(8594),
    'darr'     => chr(8595),
    'harr'     => chr(8596),
    'crarr'    => chr(8629),
    'lArr'     => chr(8656),
    'uArr'     => chr(8657),
    'rArr'     => chr(8658),
    'dArr'     => chr(8659),
    'hArr'     => chr(8660),

    # Mathematical Operators
    # * should there be a more memorable name than 'ni'?
    # * prod is NOT the same character as U+03A0 'greek capital letter pi' though the same glyph might be used for both
    # * sum is NOT the same character as U+03A3 'greek capital letter sigma' though the same glyph might be used for both
    # * sim: tilde operator is NOT the same character as the tilde, U+007E, although the same glyph might be used to represent both
    # * note that nsup, 'not a superset of, U+2283' is not covered by the Symbol font encoding and is not included.
    #   Should it be, for symmetry? It is in ISOamsn
    # * sdot: dot operator is NOT the same character as U+00B7 middle dot
    'forall'   => chr(8704),
    'part'     => chr(8706),
    'exist'    => chr(8707),
    'empty'    => chr(8709),
    'nabla'    => chr(8711),
    'isin'     => chr(8712),
    'notin'    => chr(8713),
    'ni'       => chr(8715),
    'prod'     => chr(8719),
    'sum'      => chr(8721),
    'minus'    => chr(8722),
    'lowast'   => chr(8727),
    'radic'    => chr(8730),
    'prop'     => chr(8733),
    'infin'    => chr(8734),
    'ang'      => chr(8736),
    'and'      => chr(8743),
    'or'       => chr(8744),
    'cap'      => chr(8745),
    'cup'      => chr(8746),
    'int'      => chr(8747),
    'there4'   => chr(8756),
    'sim'      => chr(8764),
    'cong'     => chr(8773),
    'asymp'    => chr(8776),
    'ne'       => chr(8800),
    'equiv'    => chr(8801),
    'le'       => chr(8804),
    'ge'       => chr(8805),
    'sub'      => chr(8834),
    'sup'      => chr(8835),
    'nsub'     => chr(8836),
    'sube'     => chr(8838),
    'supe'     => chr(8839),
    'oplus'    => chr(8853),
    'otimes'   => chr(8855),
    'perp'     => chr(8869),
    'sdot'     => chr(8901),

    # Miscellaneous Technical
    # * lang is NOT the same character as U+003C 'less than' or U+2039 'single left-pointing angle quotation mark'
    # * rang is NOT the same character as U+003E 'greater than' or U+203A 'single right-pointing angle quotation mark'
    'lceil'    => chr(8968),
    'rceil'    => chr(8969),
    'lfloor'   => chr(8970),
    'rfloor'   => chr(8971),
    'lang'     => chr(9001),
    'rang'     => chr(9002),

    # Geometric Shapes
    'loz'      => chr(9674),

    # Miscellaneous Symbols
    # * black here seems to mean filled as opposed to hollow
    'spades'   => chr(9824),
    'clubs'    => chr(9827),
    'hearts'   => chr(9829),
    'diams'    => chr(9830),
);


# $class->process ($entity_name);
# -------------------------------
# Class method. Looks $entity_name (e.g. 'eacute', without '&' and ';') up in
# %ENTITY_2_CHAR and returns the corresponding character, or undef when the
# name is not in the table.
#
# Emits a warning (but still performs the lookup) when it is not called with
# exactly two arguments, i.e. the invocant plus the entity name.
sub process
{
    # The message names this package so the warning points at the right module.
    (@_ == 2) or warn "MKDoc::XML::Decode::XHTML::process() should be called with two arguments";

    my ($class, $stuff) = @_;
    return $ENTITY_2_CHAR{$stuff};
}


1;