diff -pruN /temp/dillo/src/html.c dillo-char-representation/src/html.c --- /temp/dillo/src/html.c 2003-05-05 14:44:42.000000000 +0700 +++ dillo-char-representation/src/html.c 2003-05-13 15:03:27.000000000 +0700 @@ -620,73 +620,76 @@ static void Html_stash_init(DilloHtml *h typedef struct { char *entity; int isocode; + char *represent; } Ent_t; #define NumEnt 252 +#define MaxRepresent 10 + static const Ent_t Entities[NumEnt] = { - {"AElig",0306}, {"Aacute",0301}, {"Acirc",0302}, {"Agrave",0300}, - {"Alpha",01621},{"Aring",0305}, {"Atilde",0303}, {"Auml",0304}, - {"Beta",01622}, {"Ccedil",0307}, {"Chi",01647}, {"Dagger",020041}, - {"Delta",01624},{"ETH",0320}, {"Eacute",0311}, {"Ecirc",0312}, - {"Egrave",0310},{"Epsilon",01625},{"Eta",01627}, {"Euml",0313}, - {"Gamma",01623},{"Iacute",0315}, {"Icirc",0316}, {"Igrave",0314}, - {"Iota",01631}, {"Iuml",0317}, {"Kappa",01632}, {"Lambda",01633}, - {"Mu",01634}, {"Ntilde",0321}, {"Nu",01635}, {"OElig",0522}, - {"Oacute",0323},{"Ocirc",0324}, {"Ograve",0322}, {"Omega",01651}, - {"Omicron",01637},{"Oslash",0330},{"Otilde",0325},{"Ouml",0326}, - {"Phi",01646}, {"Pi",01640}, {"Prime",020063},{"Psi",01650}, - {"Rho",01641}, {"Scaron",0540}, {"Sigma",01643}, {"THORN",0336}, - {"Tau",01644}, {"Theta",01630}, {"Uacute",0332}, {"Ucirc",0333}, - {"Ugrave",0331},{"Upsilon",01645},{"Uuml",0334}, {"Xi",01636}, - {"Yacute",0335},{"Yuml",0570}, {"Zeta",01626}, {"aacute",0341}, - {"acirc",0342}, {"acute",0264}, {"aelig",0346}, {"agrave",0340}, - {"alefsym",020465},{"alpha",01661},{"amp",38}, {"and",021047}, - {"ang",021040}, {"aring",0345}, {"asymp",021110},{"atilde",0343}, - {"auml",0344}, {"bdquo",020036},{"beta",01662}, {"brvbar",0246}, - {"bull",020042},{"cap",021051}, {"ccedil",0347}, {"cedil",0270}, - {"cent",0242}, {"chi",01707}, {"circ",01306}, {"clubs",023143}, - {"cong",021105},{"copy",0251}, {"crarr",020665},{"cup",021052}, - {"curren",0244},{"dArr",020723}, {"dagger",020040},{"darr",020623}, - {"deg",0260}, {"delta",01664}, 
{"diams",023146},{"divide",0367}, - {"eacute",0351},{"ecirc",0352}, {"egrave",0350}, {"empty",021005}, - {"emsp",020003},{"ensp",020002}, {"epsilon",01665},{"equiv",021141}, - {"eta",01667}, {"eth",0360}, {"euml",0353}, {"euro",020254}, - {"exist",021003},{"fnof",0622}, {"forall",021000},{"frac12",0275}, - {"frac14",0274},{"frac34",0276}, {"frasl",020104},{"gamma",01663}, - {"ge",021145}, {"gt",62}, {"hArr",020724}, {"harr",020624}, - {"hearts",023145},{"hellip",020046},{"iacute",0355},{"icirc",0356}, - {"iexcl",0241}, {"igrave",0354}, {"image",020421},{"infin",021036}, - {"int",021053}, {"iota",01671}, {"iquest",0277}, {"isin",021010}, - {"iuml",0357}, {"kappa",01672}, {"lArr",020720}, {"lambda",01673}, - {"lang",021451},{"laquo",0253}, {"larr",020620}, {"lceil",021410}, - {"ldquo",020034},{"le",021144}, {"lfloor",021412},{"lowast",021027}, - {"loz",022712}, {"lrm",020016}, {"lsaquo",020071},{"lsquo",020030}, - {"lt",60}, {"macr",0257}, {"mdash",020024},{"micro",0265}, - {"middot",0267},{"minus",021022},{"mu",01674}, {"nabla",021007}, - {"nbsp",32}, {"ndash",020023},{"ne",021140}, {"ni",021013}, - {"not",0254}, {"notin",021011},{"nsub",021204}, {"ntilde",0361}, - {"nu",01675}, {"oacute",0363}, {"ocirc",0364}, {"oelig",0523}, - {"ograve",0362},{"oline",020076},{"omega",01711}, {"omicron",01677}, - {"oplus",021225},{"or",021050}, {"ordf",0252}, {"ordm",0272}, - {"oslash",0370},{"otilde",0365}, {"otimes",021227},{"ouml",0366}, - {"para",0266}, {"part",021002}, {"permil",020060},{"perp",021245}, - {"phi",01706}, {"pi",01700}, {"piv",01726}, {"plusmn",0261}, - {"pound",0243}, {"prime",020062},{"prod",021017}, {"prop",021035}, - {"psi",01710}, {"quot",34}, {"rArr",020722}, {"radic",021032}, - {"rang",021452},{"raquo",0273}, {"rarr",020622}, {"rceil",021411}, - {"rdquo",020035},{"real",020434},{"reg",0256}, {"rfloor",021413}, - {"rho",01701}, {"rlm",020017}, {"rsaquo",020072},{"rsquo",020031}, - {"sbquo",020032},{"scaron",0541},{"sdot",021305}, {"sect",0247}, - 
{"shy",0255}, {"sigma",01703}, {"sigmaf",01702},{"sim",021074}, - {"spades",023140},{"sub",021202},{"sube",021206}, {"sum",021021}, - {"sup",021203}, {"sup1",0271}, {"sup2",0262}, {"sup3",0263}, - {"supe",021207},{"szlig",0337}, {"tau",01704}, {"there4",021064}, - {"theta",01670},{"thetasym",01721},{"thinsp",020011},{"thorn",0376}, - {"tilde",01334},{"times",0327}, {"trade",020442},{"uArr",020721}, - {"uacute",0372},{"uarr",020621}, {"ucirc",0373}, {"ugrave",0371}, - {"uml",0250}, {"upsih",01722}, {"upsilon",01705},{"uuml",0374}, - {"weierp",020430},{"xi",01676}, {"yacute",0375}, {"yen",0245}, - {"yuml",0377}, {"zeta",01666}, {"zwj",020015}, {"zwnj",020014} + {"AElig", 0x00c6,"AE"}, {"Aacute", 0x00c1,"A"}, {"Acirc", 0x00c2,"A"}, {"Agrave",0x00c0,"A"}, + {"Alpha", 0x0391,"A"}, {"Aring", 0x00c5,"A"}, {"Atilde",0x00c3,"A"}, {"Auml", 0x00c4,"Ae"}, + {"Beta", 0x0392,"B"}, {"Ccedil", 0x00c7,"C,"}, {"Chi", 0x03a7,"X"}, {"Dagger",0x2021,"/="}, + {"Delta", 0x0394,"D"}, {"ETH", 0x00d0,"D-"}, {"Eacute",0x00c9,"E"}, {"Ecirc", 0x00ca,"E"}, + {"Egrave", 0x00c8,"E"}, {"Epsilon",0x0395,"E"}, {"Eta", 0x0397,"Y"}, {"Euml", 0x00cb,"E"}, + {"Gamma", 0x0393,"G"}, {"Iacute", 0x00cd,"I"}, {"Icirc", 0x00ce,"I"}, {"Igrave",0x00cc,"I"}, + {"Iota", 0x0399,"I"}, {"Iuml", 0x00cf,"I"}, {"Kappa", 0x039a,"K"}, {"Lambda",0x039b,"L"}, + {"Mu", 0x039c,"M"}, {"Ntilde", 0x00d1,"N"}, {"Nu", 0x039d,"N"}, {"OElig", 0x0152,"OE"}, + {"Oacute", 0x00d3,"O"}, {"Ocirc", 0x00d4,"O"}, {"Ograve",0x00d2,"O"}, {"Omega", 0x03a9,"W*"}, + {"Omicron",0x039f,"O"}, {"Oslash", 0x00d8,"O/"}, {"Otilde",0x00d5,"O"}, {"Ouml", 0x00d6,"O"}, + {"Phi", 0x03a6,"F"}, {"Pi", 0x03a0,"P"}, {"Prime", 0x2033,"''"}, {"Psi", 0x03a8,"Q"}, + {"Rho", 0x03a1,"R"}, {"Scaron", 0x0160,"S"}, {"Sigma", 0x03a3,"S"}, {"THORN", 0x00de,"TH"}, + {"Tau", 0x03a4,"T"}, {"Theta", 0x0398,"TH"}, {"Uacute",0x00da,"U"}, {"Ucirc", 0x00db,"U"}, + {"Ugrave", 0x00d9,"U"}, {"Upsilon",0x03a5,"U"}, {"Uuml", 0x00dc,"U"}, {"Xi", 0x039e,"X"}, + {"Yacute", 0x00dd,"Y"}, 
{"Yuml", 0x0178,"Y"}, {"Zeta", 0x0396,"Z"}, {"aacute",0x00e1,"a"}, + {"acirc", 0x00e2,"a"}, {"acute", 0x00b4,"'"}, {"aelig", 0x00e6,"ae"}, {"agrave",0x00e0,"`a"}, + {"alefsym",0x2135,"Aleph"},{"alpha",0x03b1,"a"}, {"amp", 0x0026,"&"}, {"and", 0x2227,"AND"}, + {"ang", 0x2220,"-V"}, {"aring", 0x00e5,"aa"}, {"asymp", 0x2248,"~="}, {"atilde",0x00e3,"a"}, + {"auml", 0x00e4,"a"}, {"bdquo", 0x201e,"\""}, {"beta", 0x03b2,"b"}, {"brvbar",0x00a6,"|"}, + {"bull", 0x2022,"o"}, {"cap", 0x2229,"(U"}, {"ccedil",0x00e7,"c,"}, {"cedil", 0x00b8,","}, + {"cent", 0x00a2,"c"}, {"chi", 0x03c7,"x"}, {"circ", 0x02c6,"^"}, {"clubs", 0x2663,"cC"}, + {"cong", 0x2245,"?="}, {"copy", 0x00a9,"(c)"}, {"crarr", 0x21b5,"RET"},{"cup", 0x222a,")U"}, + {"curren", 0x00a4,"CUR"}, {"dArr", 0x21d3,"vv"}, {"dagger",0x2020,"/-"}, {"darr", 0x2193,"-v"}, + {"deg", 0x00b0,"DEG"}, {"delta", 0x03b4,"d"}, {"diams", 0x2666,"cD-"},{"divide",0x00f7,"-"}, + {"eacute", 0x00e9,"e"}, {"ecirc", 0x00ea,"e"}, {"egrave",0x00e8,"e"}, {"empty", 0x2205,"{}"}, + {"emsp", 0x2003," "}, {"ensp", 0x2002," "}, {"epsilon",0x03b5,"e"}, {"equiv", 0x2261,"=3"}, + {"eta", 0x03b7,"y"}, {"eth", 0x00f0,"d-"}, {"euml", 0x00eb,"e"}, {"euro", 0x20ac,"EUR"}, + {"exist", 0x2203,"TE"}, {"fnof", 0x0192," f"}, {"forall",0x2200,"FA"}, {"frac12",0x00bd," 1/2"}, + {"frac14", 0x00bc," 1/4"},{"frac34", 0x00be," 3/4"},{"frasl", 0x2044,"/"}, {"gamma", 0x03b3,"g"}, + {"ge", 0x2265,">="}, {"gt", 0x003e,">"}, {"hArr", 0x21d4,"<=>"},{"harr", 0x2194,"<->"}, + {"hearts", 0x2665,"cH-"}, {"hellip", 0x2026,"..."}, {"iacute",0x00ed,"i"}, {"icirc", 0x00ee,"i"}, + {"iexcl", 0x00a1,"!"}, {"igrave", 0x00ec,"`i"}, {"image", 0x2111,"Im"}, {"infin", 0x221e,"infty"}, + {"int", 0x222b,"\"int "},{"iota", 0x03b9,"i"}, {"iquest",0x00bf,"?"}, {"isin", 0x2208,"(-"}, + {"iuml", 0x00ef,"i"}, {"kappa", 0x03ba,"k"}, {"lArr", 0x21d0,"<="}, {"lambda",0x03bb,"l"}, + {"lang", 0x2329,"</"}, {"laquo", 0x00ab,"<<"}, {"larr", 0x2190,"<-"}, {"lceil", 0x2308,"<7"}, + {"ldquo", 0x201c,"\""}, {"le", 0x2264,"=<"}, {"lfloor",0x230a,"7<"}, {"lowast",0x2217,"*-"}, + {"loz", 0x25ca,"LZ"}, {"lrm", 0x200e,"(->)"},{"lsaquo",0x2039,"<"}, {"lsquo", 0x2018,"`"}, + {"lt", 0x003c,"<"}, {"macr", 0x00af,"-"}, {"mdash", 
0x2014,"--"}, {"micro", 0x00b5,"u"}, + {"middot", 0x00b7,"."}, {"minus", 0x2212,"-"}, {"mu", 0x03bc,"m"}, {"nabla", 0x2207,"Nabla"}, + {"nbsp", 0x0020," "}, {"ndash", 0x2013,"-"}, {"ne", 0x2260,"!="}, {"ni", 0x220b,"-)"}, + {"not", 0x00ac,"NOT"}, {"notin", 0x2209,"!(-"}, {"nsub", 0x2284," !(C "},{"ntilde",0x00f1,"n"}, + {"nu", 0x03bd,"n"}, {"oacute", 0x00f3,"o"}, {"ocirc", 0x00f4,"o"}, {"oelig", 0x0153,"oe"}, + {"ograve", 0x00f2,"o"}, {"oline", 0x203e,"'o"}, {"omega", 0x03c9,"w"}, {"omicron",0x03bf,"o"}, + {"oplus", 0x2295,"(+)"}, {"or", 0x2228,"OR"}, {"ordf", 0x00aa,"-a"}, {"ordm", 0x00ba,"-o"}, + {"oslash", 0x00f8,"o/"}, {"otilde", 0x00f5,"o"}, {"otimes",0x2297,"(x)"},{"ouml", 0x00f6,"oe"}, + {"para", 0x00b6,"P"}, {"part", 0x2202,"\\\\partial"},{"permil",0x2030,"0/00"},{"perp",0x22a5,"-T"}, + {"phi", 0x03c6,"f"}, {"pi", 0x03c0,"p"}, {"piv", 0x03d6,"pi "},{"plusmn",0x00b1,"+-"}, + {"pound", 0x00a3,"-L-"}, {"prime", 0x2032,"'"}, {"prod", 0x220f,"\\\\prod"},{"prop",0x221d,"0("}, + {"psi", 0x03c8,"q"}, {"quot", 0x0022,"\""}, {"rArr", 0x21d2,"=>"}, {"radic", 0x221a," SQRT "}, + {"rang", 0x232a,"/>"}, {"raquo", 0x00bb,">>"}, {"rarr", 0x2192,"->"}, {"rceil", 0x2309,">7"}, + {"rdquo", 0x201d,"\""}, {"real", 0x211c,"Re"}, {"reg", 0x00ae,"(R)"},{"rfloor",0x230b,"7>"}, + {"rho", 0x03c1,"r"}, {"rlm", 0x200f,"(<-)"},{"rsaquo",0x203a,">"}, {"rsquo", 0x2019,"'"}, + {"sbquo", 0x201a,"'"}, {"scaron", 0x0161,"s"}, {"sdot", 0x22c5," DOT "},{"sect",0x00a7,"S"}, + {"shy", 0x00ad,""}, {"sigma", 0x03c3,"s"}, {"sigmaf",0x03c2,"*s"}, {"sim", 0x223c,"?1"}, + {"spades", 0x2660,"cS"}, {"sub", 0x2282,"(C"}, {"sube", 0x2286,"(_"}, {"sum", 0x2211,"\\\\sum"}, + {"sup", 0x2283,")C"}, {"sup1", 0x00b9,"^1"}, {"sup2", 0x00b2,"^2"}, {"sup3", 0x00b3,"^3"}, + {"supe", 0x2287,")_"}, {"szlig", 0x00df,"ss"}, {"tau", 0x03c4,"t"}, {"there4",0x2234,"."}, + {"theta", 0x03b8,"th"}, {"thetasym",0x03d1,"theta "},{"thinsp",0x2009," "},{"thorn",0x00fe,"th"}, + {"tilde", 0x02dc,"~"}, {"times", 0x00d7," *"}, 
{"trade", 0x2122,"(TM)"},{"uArr", 0x21d1,"^^"}, + {"uacute", 0x00fa,"u"}, {"uarr", 0x2191,"-^"}, {"ucirc", 0x00fb,"u"}, {"ugrave",0x00f9,"u"}, + {"uml", 0x00a8,"\""}, {"upsih", 0x03d2,"upsi "},{"upsilon",0x03c5,"u"},{"uuml", 0x00fc,"ue"}, + {"weierp", 0x2118,"P"}, {"xi", 0x03be,"c"}, {"yacute",0x00fd,"y"}, {"yen", 0x00a5,"YEN"}, + {"yuml", 0x00ff,"y"}, {"zeta", 0x03b6,"z"}, {"zwj", 0x200d,""}, {"zwnj", 0x200c,""} }; @@ -713,15 +716,19 @@ static int Html_entity_search(char *key) } /* - * Given an entity, return the ISO-Latin1 character code. - * (-1 if not a valid entity) + * Given an entity, return the newly allocated char string which contains + * ISO-Latin1 char if it can be found or ASCII-representation of entity. + * (0 if not a valid entity) */ -static gint Html_parse_entity(const gchar *token, gint toksize) +static char* Html_parse_entity(const gchar *token, gint toksize) { gint base, isocode, i; gchar *eoe, *name; + char *result; + + g_return_val_if_fail (token[0] == '&', 0); - g_return_val_if_fail (token[0] == '&', -1); + result = 0; eoe = (toksize) ? memchr(token, ';', toksize) : strchr(token, ';'); if (eoe) { @@ -729,16 +736,38 @@ static gint Html_parse_entity(const gcha /* Numeric token */ base = (token[2] == 'x' || token[2] == 'X') ? 16 : 10; isocode = strtol(token + 2 + (base==16), NULL, base); - return (isocode > 0 && isocode <= 255) ? isocode : -1; + if (isocode > 0 && isocode <= 255) + { + result = g_new (char, 2); + result [0] = isocode; + result [1] = '\0'; + } + else + /* Slowly find an entity by code */ + for (i = 0; i < NumEnt; i++) + if (Entities[i].isocode == isocode) + result = g_strdup (Entities[i].represent); } else { /* Search for named entity */ name = g_strndup(token + 1, eoe - token - 1); i = Html_entity_search(name); g_free(name); - return (i != -1) ? 
Entities[i].isocode : -1; + if (i != -1) + { + isocode = Entities[i].isocode; + if (isocode > 0 && isocode <= 255) + { + result = g_new (char, 2); + result [0] = isocode; + result [1] = '\0'; + } + else + result = g_strdup (Entities[i].represent); + } } } - return -1; + /* If entity was not found then result = 0 */ + return result; } /* @@ -748,16 +777,19 @@ static gint Html_parse_entity(const gcha static char *Html_parse_entities(gchar *token, gint toksize) { gchar *new_str; - gint i, j, isocode; + gint i, j; + char *entstring; if ( memchr(token, '&', toksize) == NULL ) return g_strndup(token, toksize); - new_str = g_new(char, toksize + 1); + new_str = g_new(char, (toksize) * MaxRepresent + 1); for (i = j = 0; i < toksize; i++) { if (token[i] == '&' && - (isocode = Html_parse_entity(token + i, toksize - i)) != -1) { - new_str[j++] = isocode; + (entstring = Html_parse_entity(token + i, toksize - i))) { + strcpy (new_str + j, entstring); + j += strlen (entstring); + g_free (entstring); while (token[++i] != ';'); } else { new_str[j++] = token[i]; @@ -3881,7 +3913,8 @@ static const char *Html_get_attr2(DilloH const char *attrname, DilloHtmlTagParsingFlags flags) { - gint i, isocode, Found = 0, delimiter = 0, attr_pos = 0; + gint i, Found = 0, delimiter = 0, attr_pos = 0; + char *entstring; GString *Buf = html->attr_data; DilloHtmlTagParsingState state = SEEK_ATTR_START; @@ -3933,8 +3966,9 @@ static const char *Html_get_attr2(DilloH tag[i] == delimiter) { state = FINISHED; } else if (tag[i] == '&' && (flags & HTML_ParseEntities)) { - if ((isocode = Html_parse_entity(tag+i, tagsize-i)) != -1) { - g_string_append_c(Buf, (gchar) isocode); + if ((entstring = Html_parse_entity(tag+i, tagsize-i))) { + g_string_append(Buf, entstring); + g_free (entstring); while (tag[++i] != ';'); } else { g_string_append_c(Buf, tag[i]);