diff -pruN /temp/dillo/src/html.c dillo-char-representation/src/html.c
--- /temp/dillo/src/html.c 2003-05-05 14:44:42.000000000 +0700
+++ dillo-char-representation/src/html.c 2003-05-13 15:03:27.000000000 +0700
@@ -620,73 +620,76 @@ static void Html_stash_init(DilloHtml *h
typedef struct {
char *entity;
int isocode;
+ char *represent;
} Ent_t;
#define NumEnt 252
+#define MaxRepresent 10
+
static const Ent_t Entities[NumEnt] = {
- {"AElig",0306}, {"Aacute",0301}, {"Acirc",0302}, {"Agrave",0300},
- {"Alpha",01621},{"Aring",0305}, {"Atilde",0303}, {"Auml",0304},
- {"Beta",01622}, {"Ccedil",0307}, {"Chi",01647}, {"Dagger",020041},
- {"Delta",01624},{"ETH",0320}, {"Eacute",0311}, {"Ecirc",0312},
- {"Egrave",0310},{"Epsilon",01625},{"Eta",01627}, {"Euml",0313},
- {"Gamma",01623},{"Iacute",0315}, {"Icirc",0316}, {"Igrave",0314},
- {"Iota",01631}, {"Iuml",0317}, {"Kappa",01632}, {"Lambda",01633},
- {"Mu",01634}, {"Ntilde",0321}, {"Nu",01635}, {"OElig",0522},
- {"Oacute",0323},{"Ocirc",0324}, {"Ograve",0322}, {"Omega",01651},
- {"Omicron",01637},{"Oslash",0330},{"Otilde",0325},{"Ouml",0326},
- {"Phi",01646}, {"Pi",01640}, {"Prime",020063},{"Psi",01650},
- {"Rho",01641}, {"Scaron",0540}, {"Sigma",01643}, {"THORN",0336},
- {"Tau",01644}, {"Theta",01630}, {"Uacute",0332}, {"Ucirc",0333},
- {"Ugrave",0331},{"Upsilon",01645},{"Uuml",0334}, {"Xi",01636},
- {"Yacute",0335},{"Yuml",0570}, {"Zeta",01626}, {"aacute",0341},
- {"acirc",0342}, {"acute",0264}, {"aelig",0346}, {"agrave",0340},
- {"alefsym",020465},{"alpha",01661},{"amp",38}, {"and",021047},
- {"ang",021040}, {"aring",0345}, {"asymp",021110},{"atilde",0343},
- {"auml",0344}, {"bdquo",020036},{"beta",01662}, {"brvbar",0246},
- {"bull",020042},{"cap",021051}, {"ccedil",0347}, {"cedil",0270},
- {"cent",0242}, {"chi",01707}, {"circ",01306}, {"clubs",023143},
- {"cong",021105},{"copy",0251}, {"crarr",020665},{"cup",021052},
- {"curren",0244},{"dArr",020723}, {"dagger",020040},{"darr",020623},
- {"deg",0260}, {"delta",01664}, {"diams",023146},{"divide",0367},
- {"eacute",0351},{"ecirc",0352}, {"egrave",0350}, {"empty",021005},
- {"emsp",020003},{"ensp",020002}, {"epsilon",01665},{"equiv",021141},
- {"eta",01667}, {"eth",0360}, {"euml",0353}, {"euro",020254},
- {"exist",021003},{"fnof",0622}, {"forall",021000},{"frac12",0275},
- {"frac14",0274},{"frac34",0276}, {"frasl",020104},{"gamma",01663},
- {"ge",021145}, {"gt",62}, {"hArr",020724}, {"harr",020624},
- {"hearts",023145},{"hellip",020046},{"iacute",0355},{"icirc",0356},
- {"iexcl",0241}, {"igrave",0354}, {"image",020421},{"infin",021036},
- {"int",021053}, {"iota",01671}, {"iquest",0277}, {"isin",021010},
- {"iuml",0357}, {"kappa",01672}, {"lArr",020720}, {"lambda",01673},
- {"lang",021451},{"laquo",0253}, {"larr",020620}, {"lceil",021410},
- {"ldquo",020034},{"le",021144}, {"lfloor",021412},{"lowast",021027},
- {"loz",022712}, {"lrm",020016}, {"lsaquo",020071},{"lsquo",020030},
- {"lt",60}, {"macr",0257}, {"mdash",020024},{"micro",0265},
- {"middot",0267},{"minus",021022},{"mu",01674}, {"nabla",021007},
- {"nbsp",32}, {"ndash",020023},{"ne",021140}, {"ni",021013},
- {"not",0254}, {"notin",021011},{"nsub",021204}, {"ntilde",0361},
- {"nu",01675}, {"oacute",0363}, {"ocirc",0364}, {"oelig",0523},
- {"ograve",0362},{"oline",020076},{"omega",01711}, {"omicron",01677},
- {"oplus",021225},{"or",021050}, {"ordf",0252}, {"ordm",0272},
- {"oslash",0370},{"otilde",0365}, {"otimes",021227},{"ouml",0366},
- {"para",0266}, {"part",021002}, {"permil",020060},{"perp",021245},
- {"phi",01706}, {"pi",01700}, {"piv",01726}, {"plusmn",0261},
- {"pound",0243}, {"prime",020062},{"prod",021017}, {"prop",021035},
- {"psi",01710}, {"quot",34}, {"rArr",020722}, {"radic",021032},
- {"rang",021452},{"raquo",0273}, {"rarr",020622}, {"rceil",021411},
- {"rdquo",020035},{"real",020434},{"reg",0256}, {"rfloor",021413},
- {"rho",01701}, {"rlm",020017}, {"rsaquo",020072},{"rsquo",020031},
- {"sbquo",020032},{"scaron",0541},{"sdot",021305}, {"sect",0247},
- {"shy",0255}, {"sigma",01703}, {"sigmaf",01702},{"sim",021074},
- {"spades",023140},{"sub",021202},{"sube",021206}, {"sum",021021},
- {"sup",021203}, {"sup1",0271}, {"sup2",0262}, {"sup3",0263},
- {"supe",021207},{"szlig",0337}, {"tau",01704}, {"there4",021064},
- {"theta",01670},{"thetasym",01721},{"thinsp",020011},{"thorn",0376},
- {"tilde",01334},{"times",0327}, {"trade",020442},{"uArr",020721},
- {"uacute",0372},{"uarr",020621}, {"ucirc",0373}, {"ugrave",0371},
- {"uml",0250}, {"upsih",01722}, {"upsilon",01705},{"uuml",0374},
- {"weierp",020430},{"xi",01676}, {"yacute",0375}, {"yen",0245},
- {"yuml",0377}, {"zeta",01666}, {"zwj",020015}, {"zwnj",020014}
+ {"AElig", 0x00c6,"AE"}, {"Aacute", 0x00c1,"A"}, {"Acirc", 0x00c2,"A"}, {"Agrave",0x00c0,"A"},
+ {"Alpha", 0x0391,"A"}, {"Aring", 0x00c5,"A"}, {"Atilde",0x00c3,"A"}, {"Auml", 0x00c4,"Ae"},
+ {"Beta", 0x0392,"B"}, {"Ccedil", 0x00c7,"C,"}, {"Chi", 0x03a7,"X"}, {"Dagger",0x2021,"/="},
+ {"Delta", 0x0394,"D"}, {"ETH", 0x00d0,"D-"}, {"Eacute",0x00c9,"E"}, {"Ecirc", 0x00ca,"E"},
+ {"Egrave", 0x00c8,"E"}, {"Epsilon",0x0395,"E"}, {"Eta", 0x0397,"Y"}, {"Euml", 0x00cb,"E"},
+ {"Gamma", 0x0393,"G"}, {"Iacute", 0x00cd,"I"}, {"Icirc", 0x00ce,"I"}, {"Igrave",0x00cc,"I"},
+ {"Iota", 0x0399,"I"}, {"Iuml", 0x00cf,"I"}, {"Kappa", 0x039a,"K"}, {"Lambda",0x039b,"L"},
+ {"Mu", 0x039c,"M"}, {"Ntilde", 0x00d1,"N"}, {"Nu", 0x039d,"N"}, {"OElig", 0x0152,"OE"},
+ {"Oacute", 0x00d3,"O"}, {"Ocirc", 0x00d4,"O"}, {"Ograve",0x00d2,"O"}, {"Omega", 0x03a9,"W*"},
+ {"Omicron",0x039f,"O"}, {"Oslash", 0x00d8,"O/"}, {"Otilde",0x00d5,"O"}, {"Ouml", 0x00d6,"O"},
+ {"Phi", 0x03a6,"F"}, {"Pi", 0x03a0,"P"}, {"Prime", 0x2033,"''"}, {"Psi", 0x03a8,"Q"},
+ {"Rho", 0x03a1,"R"}, {"Scaron", 0x0160,"S"}, {"Sigma", 0x03a3,"S"}, {"THORN", 0x00de,"TH"},
+ {"Tau", 0x03a4,"T"}, {"Theta", 0x0398,"TH"}, {"Uacute",0x00da,"U"}, {"Ucirc", 0x00db,"U"},
+ {"Ugrave", 0x00d9,"U"}, {"Upsilon",0x03a5,"U"}, {"Uuml", 0x00dc,"U"}, {"Xi", 0x039e,"X"},
+ {"Yacute", 0x00dd,"Y"}, {"Yuml", 0x0178,"Y"}, {"Zeta", 0x0396,"Z"}, {"aacute",0x00e1,"a"},
+ {"acirc", 0x00e2,"a"}, {"acute", 0x00b4,"'"}, {"aelig", 0x00e6,"ae"}, {"agrave",0x00e0,"`a"},
+ {"alefsym",0x2135,"Aleph"},{"alpha",0x03b1,"a"}, {"amp", 0x0026,"&"}, {"and", 0x2227,"AND"},
+ {"ang", 0x2220,"-V"}, {"aring", 0x00e5,"aa"}, {"asymp", 0x2248,"~="}, {"atilde",0x00e3,"a"},
+ {"auml", 0x00e4,"a"}, {"bdquo", 0x201e,"\""}, {"beta", 0x03b2,"b"}, {"brvbar",0x00a6,"|"},
+ {"bull", 0x2022,"o"}, {"cap", 0x2229,"(U"}, {"ccedil",0x00e7,"c,"}, {"cedil", 0x00b8,","},
+ {"cent", 0x00a2,"c"}, {"chi", 0x03c7,"x"}, {"circ", 0x02c6,"^"}, {"clubs", 0x2663,"cC"},
+ {"cong", 0x2245,"?="}, {"copy", 0x00a9,"(c)"}, {"crarr", 0x21b5,"RET"},{"cup", 0x222a,")U"},
+ {"curren", 0x00a4,"CUR"}, {"dArr", 0x21d3,"vv"}, {"dagger",0x2020,"/-"}, {"darr", 0x2193,"-v"},
+ {"deg", 0x00b0,"DEG"}, {"delta", 0x03b4,"d"}, {"diams", 0x2666,"cD-"},{"divide",0x00f7,"-"},
+ {"eacute", 0x00e9,"e"}, {"ecirc", 0x00ea,"e"}, {"egrave",0x00e8,"e"}, {"empty", 0x2205,"{}"},
+ {"emsp", 0x2003," "}, {"ensp", 0x2002," "}, {"epsilon",0x03b5,"e"}, {"equiv", 0x2261,"=3"},
+ {"eta", 0x03b7,"y"}, {"eth", 0x00f0,"d-"}, {"euml", 0x00eb,"e"}, {"euro", 0x20ac,"EUR"},
+ {"exist", 0x2203,"TE"}, {"fnof", 0x0192," f"}, {"forall",0x2200,"FA"}, {"frac12",0x00bd," 1/2"},
+ {"frac14", 0x00bc," 1/4"},{"frac34", 0x00be," 3/4"},{"frasl", 0x2044,"/"}, {"gamma", 0x03b3,"g"},
+ {"ge", 0x2265,">="}, {"gt", 0x003e,">"}, {"hArr", 0x21d4,"<=>"},{"harr", 0x2194,"<->"},
+ {"hearts", 0x2665,"cH-"}, {"hellip", 0x2026,"..."}, {"iacute",0x00ed,"i"}, {"icirc", 0x00ee,"i"},
+ {"iexcl", 0x00a1,"!"}, {"igrave", 0x00ec,"`i"}, {"image", 0x2111,"Im"}, {"infin", 0x221e,"infty"},
+ {"int", 0x222b,"\"int "},{"iota", 0x03b9,"i"}, {"iquest",0x00bf,"?"}, {"isin", 0x2208,"(-"},
+ {"iuml", 0x00ef,"i"}, {"kappa", 0x03ba,"k"}, {"lArr", 0x21d0,"<="}, {"lambda",0x03bb,"l"},
+ {"lang", 0x2329,""}, {"laquo", 0x00ab,"<<"}, {"larr", 0x2190,"<-"}, {"lceil", 0x2308,"<7"},
+ {"ldquo", 0x201c,"\""}, {"le", 0x2264,"=<"}, {"lfloor",0x230a,"7<"}, {"lowast",0x2217,"*"},
+ {"loz", 0x25ca,"LZ"}, {"lrm", 0x200e,"(->)"},{"lsaquo",0x2039,"<"}, {"lsquo", 0x2018,"`"},
+ {"lt", 0x003c,"<"}, {"macr", 0x00af,"-"}, {"mdash", 0x2014,"--"}, {"micro", 0x00b5,"u"},
+ {"middot", 0x00b7,"."}, {"minus", 0x2212,"-"}, {"mu", 0x03bc,"m"}, {"nabla", 0x2207,"Nabla"},
+ {"nbsp", 0x0020," "}, {"ndash", 0x2013,"-"}, {"ne", 0x2260,"!="}, {"ni", 0x220b,"-)"},
+ {"not", 0x00ac,"NOT"}, {"notin", 0x2209,"!(-"}, {"nsub", 0x2284," !(C "},{"ntilde",0x00f1,"n"},
+ {"nu", 0x03bd,"n"}, {"oacute", 0x00f3,"o"}, {"ocirc", 0x00f4,"o"}, {"oelig", 0x0153,"oe"},
+ {"ograve", 0x00f2,"o"}, {"oline", 0x203e,"'o"}, {"omega", 0x03c9,"w"}, {"omicron",0x03bf,"o"},
+ {"oplus", 0x2295,"(+)"}, {"or", 0x2228,"OR"}, {"ordf", 0x00aa,"-a"}, {"ordm", 0x00ba,"-o"},
+ {"oslash", 0x00f8,"o/"}, {"otilde", 0x00f5,"o"}, {"otimes",0x2297,"(x)"},{"ouml", 0x00f6,"oe"},
+ {"para", 0x00b6,"P"}, {"part", 0x2202,"\\\\partial"},{"permil",0x2030,"0/00"},{"perp",0x22a5,"-T"},
+ {"phi", 0x03c6,"f"}, {"pi", 0x03c0,"p"}, {"piv", 0x03d6,"pi "},{"plusmn",0x00b1,"+-"},
+ {"pound", 0x00a3,"-L-"}, {"prime", 0x2032,"'"}, {"prod", 0x220f,"\\\\prod"},{"prop",0x221d,"0("},
+ {"psi", 0x03c8,"q"}, {"quot", 0x0022,"\""}, {"rArr", 0x21d2,"=>"}, {"radic", 0x221a," SQRT "},
+ {"rang", 0x232a,"/>"}, {"raquo", 0x00bb,">>"}, {"rarr", 0x2192,"->"}, {"rceil", 0x2309,">7"},
+ {"rdquo", 0x201d,"\""}, {"real", 0x211c,"Re"}, {"reg", 0x00ae,"(R)"},{"rfloor",0x230b,"7>"},
+ {"rho", 0x03c1,"r"}, {"rlm", 0x200f,"(<-)"},{"rsaquo",0x203a,">"}, {"rsquo", 0x2019,"'"},
+ {"sbquo", 0x201a,"'"}, {"scaron", 0x0161,"s"}, {"sdot", 0x22c5," DOT "},{"sect",0x00a7,"S"},
+ {"shy", 0x00ad,""}, {"sigma", 0x03c3,"s"}, {"sigmaf",0x03c2,"*s"}, {"sim", 0x223c,"?1"},
+ {"spades", 0x2660,"cS"}, {"sub", 0x2282,"(C"}, {"sube", 0x2286,"(_"}, {"sum", 0x2211,"\\\\sum"},
+ {"sup", 0x2283,")C"}, {"sup1", 0x00b9,"^1"}, {"sup2", 0x00b2,"^2"}, {"sup3", 0x00b3,"^3"},
+ {"supe", 0x2287,")_"}, {"szlig", 0x00df,"ss"}, {"tau", 0x03c4,"t"}, {"there4",0x2234,"."},
+ {"theta", 0x03b8,"th"}, {"thetasym",0x03d1,"theta "},{"thinsp",0x2009," "},{"thorn",0x00fe,"th"},
+ {"tilde", 0x02dc,"~"}, {"times", 0x00d7," *"}, {"trade", 0x2122,"(TM)"},{"uArr", 0x21d1,"^^"},
+ {"uacute", 0x00fa,"u"}, {"uarr", 0x2191,"-^"}, {"ucirc", 0x00fb,"u"}, {"ugrave",0x00f9,"u"},
+ {"uml", 0x00a8,"\""}, {"upsih", 0x03d2,"upsi "},{"upsilon",0x03c5,"u"},{"uuml", 0x00fc,"ue"},
+ {"weierp", 0x2118,"P"}, {"xi", 0x03be,"c"}, {"yacute",0x00fd,"y"}, {"yen", 0x00a5,"YEN"},
+ {"yuml", 0x00ff,"y"}, {"zeta", 0x03b6,"z"}, {"zwj", 0x200d,""}, {"zwnj", 0x200c,""}
};
@@ -713,15 +716,19 @@ static int Html_entity_search(char *key)
}
/*
- * Given an entity, return the ISO-Latin1 character code.
- * (-1 if not a valid entity)
+ * Given an entity, return a newly allocated string (the caller must g_free
+ * it) holding the ISO-Latin1 char when the code is in 1..255, or else the
+ * entity's ASCII representation. (0 if not a valid entity)
*/
-static gint Html_parse_entity(const gchar *token, gint toksize)
+static char* Html_parse_entity(const gchar *token, gint toksize)
{
gint base, isocode, i;
gchar *eoe, *name;
+ char *result;
+
+ g_return_val_if_fail (token[0] == '&', 0);
- g_return_val_if_fail (token[0] == '&', -1);
+ result = 0;
eoe = (toksize) ? memchr(token, ';', toksize) : strchr(token, ';');
if (eoe) {
@@ -729,16 +736,38 @@ static gint Html_parse_entity(const gcha
/* Numeric token */
base = (token[2] == 'x' || token[2] == 'X') ? 16 : 10;
isocode = strtol(token + 2 + (base==16), NULL, base);
- return (isocode > 0 && isocode <= 255) ? isocode : -1;
+ if (isocode > 0 && isocode <= 255)
+ {
+ result = g_new (char, 2);
+ result [0] = isocode;
+ result [1] = '\0';
+ }
+ else
+ /* Slowly find an entity by code; stop at the first match */
+ for (i = 0; i < NumEnt && !result; i++)
+ if (Entities[i].isocode == isocode)
+ result = g_strdup (Entities[i].represent);
} else {
/* Search for named entity */
name = g_strndup(token + 1, eoe - token - 1);
i = Html_entity_search(name);
g_free(name);
- return (i != -1) ? Entities[i].isocode : -1;
+ if (i != -1)
+ {
+ isocode = Entities[i].isocode;
+ if (isocode > 0 && isocode <= 255)
+ {
+ result = g_new (char, 2);
+ result [0] = isocode;
+ result [1] = '\0';
+ }
+ else
+ result = g_strdup (Entities[i].represent);
+ }
}
}
- return -1;
+ /* If entity was not found then result = 0 */
+ return result;
}
/*
@@ -748,16 +777,19 @@ static gint Html_parse_entity(const gcha
static char *Html_parse_entities(gchar *token, gint toksize)
{
gchar *new_str;
- gint i, j, isocode;
+ gint i, j;
+ char *entstring;
if ( memchr(token, '&', toksize) == NULL )
return g_strndup(token, toksize);
- new_str = g_new(char, toksize + 1);
+ new_str = g_new(char, (toksize) * MaxRepresent + 1);
for (i = j = 0; i < toksize; i++) {
if (token[i] == '&' &&
- (isocode = Html_parse_entity(token + i, toksize - i)) != -1) {
- new_str[j++] = isocode;
+ (entstring = Html_parse_entity(token + i, toksize - i))) {
+ strcpy (new_str + j, entstring);
+ j += strlen (entstring);
+ g_free (entstring);
while (token[++i] != ';');
} else {
new_str[j++] = token[i];
@@ -3881,7 +3913,8 @@ static const char *Html_get_attr2(DilloH
const char *attrname,
DilloHtmlTagParsingFlags flags)
{
- gint i, isocode, Found = 0, delimiter = 0, attr_pos = 0;
+ gint i, Found = 0, delimiter = 0, attr_pos = 0;
+ char *entstring;
GString *Buf = html->attr_data;
DilloHtmlTagParsingState state = SEEK_ATTR_START;
@@ -3933,8 +3966,9 @@ static const char *Html_get_attr2(DilloH
tag[i] == delimiter) {
state = FINISHED;
} else if (tag[i] == '&' && (flags & HTML_ParseEntities)) {
- if ((isocode = Html_parse_entity(tag+i, tagsize-i)) != -1) {
- g_string_append_c(Buf, (gchar) isocode);
+ if ((entstring = Html_parse_entity(tag+i, tagsize-i))) {
+ g_string_append(Buf, entstring);
+ g_free (entstring);
while (tag[++i] != ';');
} else {
g_string_append_c(Buf, tag[i]);