#!/usr/local/bin/perl
# refcl - reference cleaner
# Replaces HTML character entity references with New-JIS (JIS X 0213)
# characters. e.g., &euro; is replaced with a euro sign. EUC only.
# Special notation: &jis-[men]-[ku]-[ten];
#   stands for JIS X 0213 code point.
#   e.g., &jis-1-6-57; is replaced with a "final sigma".
# This script is based on the New-JIS code table X0213-p.csv
# provided by JCS/WG2 and W3C's HTML 4.0 spec.
# Feel free to copy and modify.
# No warranty.
#
# Usage: refcl [file ...]
#   Reads the named files (or standard input) line by line and writes the
#   converted text to standard output.  References that are not recognised
#   are passed through unchanged.

use strict;
use warnings;

# Entity name => two-byte EUC code, written as four hex digits.
my %euc_hex_for = qw(
    nbsp    a9a2   iexcl   a9a3   curren  a9a4   brvbar  a9a5
    copy    a9a6   ordf    a9a7   laquo   a9a8   shy     a9a9
    reg     a9aa   macr    a9ab   sup2    a9ac   sup3    a9ad
    middot  a9ae   cedil   a9af   sup1    a9b0   ordm    a9b1
    raquo   a9b2   frac14  a9b3   frac12  a9b4   frac34  a9b5
    iquest  a9b6   Agrave  a9b7   Aacute  a9b8   Acirc   a9b9
    Atilde  a9ba   Auml    a9bb   Aring   a9bc   AElig   a9bd
    Ccedil  a9be   Egrave  a9bf   Eacute  a9c0   Ecirc   a9c1
    Euml    a9c2   Igrave  a9c3   Iacute  a9c4   Icirc   a9c5
    Iuml    a9c6   ETH     a9c7   Ntilde  a9c8   Ograve  a9c9
    Oacute  a9ca   Ocirc   a9cb   Otilde  a9cc   Ouml    a9cd
    Oslash  a9ce   Ugrave  a9cf   Uacute  a9d0   Ucirc   a9d1
    Uuml    a9d2   Yacute  a9d3   THORN   a9d4   szlig   a9d5
    agrave  a9d6   aacute  a9d7   acirc   a9d8   atilde  a9d9
    auml    a9da   aring   a9db   aelig   a9dc   ccedil  a9dd
    egrave  a9de   eacute  a9df   ecirc   a9e0   euml    a9e1
    igrave  a9e2   iacute  a9e3   icirc   a9e4   iuml    a9e5
    eth     a9e6   ntilde  a9e7   ograve  a9e8   oacute  a9e9
    ocirc   a9ea   otilde  a9eb   ouml    a9ec   oslash  a9ed
    ugrave  a9ee   uacute  a9ef   ucirc   a9f0   uuml    a9f1
    yacute  a9f2   thorn   a9f3   yuml    a9f4   OElig   abab
    oelig   abaa   Scaron  aaa6   scaron  aab2   ndash   a3fc
    euro    a9a1   sigmaf  a6d9   bull    a3c0   alefsym a3dc
    harr    a2f1   empty   a2c7   notin   a2c6   cong    a2ed
    asymp   a2ee   nsub    a2c2   oplus   a2d1   otimes  a2d3
    spades  a6ba   clubs   a6c0   hearts  a6be   diams   a6bc
);

# jis_to_euc($men, $ku, $ten)
#   Converts a men-ku-ten triple into EUC bytes: each of ku and ten is
#   offset by 0xA0, and men 2 is prefixed with the byte 0x8F.
#   Returns nothing (undef in scalar context) when ku or ten lies outside
#   1..94, because 0xA0 + n would then fall outside the 0xA1..0xFE byte
#   range (and from 96 upward would not even fit in one byte).
sub jis_to_euc {
    my ($men, $ku, $ten) = @_;

    for my $cell ($ku, $ten) {
        return if $cell < 1 || $cell > 94;
    }

    my $euc = pack 'CC', 0xA0 + $ku, 0xA0 + $ten;
    return $men == 2 ? pack('C', 0x8F) . $euc : $euc;
}

# replace_reference($name)
#   $name is the text between '&' and ';'.  Returns the EUC bytes for a
#   known entity name or a valid jis-[men]-[ku]-[ten] notation; otherwise
#   returns the reference exactly as it appeared ("&$name;").
sub replace_reference {
    my ($name) = @_;
    my $original = "&$name;";

    # The pattern is anchored so that only the exact special notation is
    # accepted; names that merely contain "jis-1-2-3" are not code points.
    if ($name =~ /\Ajis-([12])-(\d\d?)-(\d\d?)\z/) {
        my $euc = jis_to_euc($1, $2, $3);
        return defined $euc ? $euc : $original;
    }

    return exists $euc_hex_for{$name}
        ? pack('H4', $euc_hex_for{$name})
        : $original;
}

# clean_references($text)
#   Returns a copy of $text with every "&name;" reference converted by
#   replace_reference().  Text is scanned once, left to right; replacement
#   bytes are never rescanned.
sub clean_references {
    my ($text) = @_;
    $text =~ s/&([a-zA-Z0-9.-]+);/replace_reference($1)/ge;
    return $text;
}

# Run as a filter only when executed directly, so the file can also be
# loaded (require/do) without consuming input.
unless (caller) {
    while (my $line = <>) {
        print clean_references($line);
    }
}