/* * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client * Copyright (C) 1999-2003 Hiroyuki Yamamoto * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include "html.h" #include "codeconv.h" #include "utils.h" #define HTMLBUFSIZE 8192 #define HR_STR "------------------------------------------------" typedef struct _HTMLSymbol HTMLSymbol; struct _HTMLSymbol { gchar *const key; gchar *const val; }; static HTMLSymbol symbol_list[] = { {"<" , "<"}, {">" , ">"}, {"&" , "&"}, {""" , "\""}, {" " , " "}, {"™" , "(TM)"}, {"™", "(TM)"}, }; static HTMLSymbol ascii_symbol_list[] = { {"¡" , "^!"}, {"¦", "|"}, {"©" , "(C)"}, {"«" , "<<"}, {"®" , "(R)"}, {"²" , "^2"}, {"³" , "^3"}, {"´" , "'"}, {"¸" , ","}, {"¹" , "^1"}, {"»" , ">>"}, {"¼", "1/4"}, {"½", "1/2"}, {"¾", "3/4"}, {"¿", "^?"}, {"À", "A`"}, {"Á", "A'"}, {"Â" , "A^"}, {"Ã", "A~"}, {"Æ" , "AE"}, {"È", "E`"}, {"É", "E'"}, {"Ê" , "E^"}, {"Ì", "I`"}, {"Í", "I'"}, {"Î" , "I^"}, {"Ñ", "N~"}, {"Ò", "O`"}, {"Ó", "O'"}, {"Ô" , "O^"}, {"Õ", "O~"}, {"Ù", "U`"}, {"Ú", "U'"}, {"Û" , "U^"}, {"Ý", "Y'"}, {"à", "a`"}, {"á", "a'"}, {"â" , "a^"}, {"ã", "a~"}, {"æ" , "ae"}, {"è", "e`"}, {"é", "e'"}, {"ê" , "e^"}, {"ì", "i`"}, {"í", "i'"}, {"î" , "i^"}, {"ñ", "n~"}, {"ò", "o`"}, {"ó", "o'"}, {"ô" , "o^"}, {"õ", "o~"}, {"ù", "u`"}, {"ú", "u'"}, {"û" , "u^"}, {"ý", "y'"}, }; static GHashTable *default_symbol_table; static HTMLState html_read_line (HTMLParser *parser); static void html_append_char (HTMLParser *parser, gchar ch); static void html_append_str (HTMLParser *parser, const gchar *str, gint len); static HTMLState html_parse_tag (HTMLParser *parser); static void html_parse_special (HTMLParser *parser); static void html_get_parenthesis (HTMLParser *parser, gchar *buf, gint len); HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv) { HTMLParser *parser; g_return_val_if_fail(fp != NULL, NULL); g_return_val_if_fail(conv != NULL, NULL); parser = g_new0(HTMLParser, 1); parser->fp = fp; parser->conv = conv; parser->str = g_string_new(NULL); parser->buf = g_string_new(NULL); parser->bufp = parser->buf->str; parser->state = HTML_NORMAL; parser->href = NULL; parser->newline = TRUE; parser->empty_line = TRUE; parser->space = FALSE; parser->pre = FALSE; #define SYMBOL_TABLE_ADD(table, list) \ { \ gint i; \ \ for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \ g_hash_table_insert(table, list[i].key, list[i].val); \ } if (!default_symbol_table) { default_symbol_table = g_hash_table_new(g_str_hash, g_str_equal); SYMBOL_TABLE_ADD(default_symbol_table, symbol_list); SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list); } #undef SYMBOL_TABLE_ADD parser->symbol_table = default_symbol_table; return parser; } void html_parser_destroy(HTMLParser *parser) { g_string_free(parser->str, TRUE); g_string_free(parser->buf, TRUE); g_free(parser->href); g_free(parser); } gchar *html_parse(HTMLParser *parser) { parser->state = HTML_NORMAL; g_string_truncate(parser->str, 0); if (*parser->bufp == '\0') { g_string_truncate(parser->buf, 0); parser->bufp = parser->buf->str; if (html_read_line(parser) == HTML_EOF) return NULL; } while (*parser->bufp != '\0') { switch (*parser->bufp) { case '<': if (parser->str->len == 0) html_parse_tag(parser); else return parser->str->str; break; case '&': html_parse_special(parser); break; case ' ': case '\t': case '\r': case '\n': if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n') parser->bufp++; if (!parser->pre) { if (!parser->newline) parser->space = TRUE; parser->bufp++; break; } /* fallthrough */ default: html_append_char(parser, *parser->bufp++); } } return parser->str->str; } static HTMLState html_read_line(HTMLParser *parser) { gchar buf[HTMLBUFSIZE]; gchar buf2[HTMLBUFSIZE]; gint index; if (fgets(buf, sizeof(buf), parser->fp) == NULL) { parser->state = HTML_EOF; return HTML_EOF; } if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) { index = parser->bufp - parser->buf->str; conv_utf8todisp(buf2, sizeof(buf2), buf); g_string_append(parser->buf, buf2); parser->bufp = parser->buf->str + index; return HTML_CONV_FAILED; } index = parser->bufp - parser->buf->str; g_string_append(parser->buf, buf2); parser->bufp = parser->buf->str + index; return HTML_NORMAL; } static void html_append_char(HTMLParser *parser, gchar ch) { GString *str = parser->str; if (!parser->pre && parser->space) { g_string_append_c(str, ' '); parser->space = FALSE; } g_string_append_c(str, ch); parser->empty_line = FALSE; if (ch == '\n') { parser->newline = TRUE; if (str->len > 1 && str->str[str->len - 2] == '\n') parser->empty_line = TRUE; } else parser->newline = FALSE; } static void html_append_str(HTMLParser *parser, const gchar *str, gint len) { GString *string = parser->str; if (!parser->pre && parser->space) { g_string_append_c(string, ' '); parser->space = FALSE; } if (len == 0) return; if (len < 0) g_string_append(string, str); else { gchar *s; Xstrndup_a(s, str, len, return); g_string_append(string, s); } parser->empty_line = FALSE; if (string->len > 0 && string->str[string->len - 1] == '\n') { parser->newline = TRUE; if (string->len > 1 && string->str[string->len - 2] == '\n') parser->empty_line = TRUE; } else parser->newline = FALSE; } static HTMLTag *html_get_tag(const gchar *str) { HTMLTag *tag; gchar *tmp; guchar *tmpp; g_return_val_if_fail(str != NULL, NULL); if (*str == '\0' || *str == '!') return NULL; Xstrdup_a(tmp, str, return NULL); tag = g_new0(HTMLTag, 1); for (tmpp = tmp; *tmpp != '\0' && !g_ascii_isspace(*tmpp); tmpp++) ; if (*tmpp == '\0') { g_strdown(tmp); tag->name = g_strdup(tmp); return tag; } else { *tmpp++ = '\0'; g_strdown(tmp); tag->name = g_strdup(tmp); } while (*tmpp != '\0') { HTMLAttr *attr; gchar *attr_name; gchar *attr_value; gchar *p; gchar quote; while (g_ascii_isspace(*tmpp)) tmpp++; attr_name = tmpp; while (*tmpp != '\0' && !g_ascii_isspace(*tmpp) && *tmpp != '=') tmpp++; if (*tmpp != '\0' && *tmpp != '=') { *tmpp++ = '\0'; while (g_ascii_isspace(*tmpp)) tmpp++; } if (*tmpp == '=') { *tmpp++ = '\0'; while (g_ascii_isspace(*tmpp)) tmpp++; if (*tmpp == '"' || *tmpp == '\'') { /* name="value" */ quote = *tmpp; tmpp++; attr_value = tmpp; if ((p = strchr(attr_value, quote)) == NULL) { g_warning("html_get_tag(): syntax error in tag: '%s'\n", str); return tag; } tmpp = p; *tmpp++ = '\0'; while (g_ascii_isspace(*tmpp)) tmpp++; } else { /* name=value */ attr_value = tmpp; while (*tmpp != '\0' && !g_ascii_isspace(*tmpp)) tmpp++; if (*tmpp != '\0') *tmpp++ = '\0'; } } else attr_value = ""; g_strchomp(attr_name); g_strdown(attr_name); attr = g_new(HTMLAttr, 1); attr->name = g_strdup(attr_name); attr->value = g_strdup(attr_value); tag->attr = g_list_append(tag->attr, attr); } return tag; } static void html_free_tag(HTMLTag *tag) { if (!tag) return; g_free(tag->name); while (tag->attr != NULL) { HTMLAttr *attr = (HTMLAttr *)tag->attr->data; g_free(attr->name); g_free(attr->value); g_free(attr); tag->attr = g_list_remove(tag->attr, tag->attr->data); } g_free(tag); } static HTMLState html_parse_tag(HTMLParser *parser) { gchar buf[HTMLBUFSIZE]; HTMLTag *tag; html_get_parenthesis(parser, buf, sizeof(buf)); tag = html_get_tag(buf); parser->state = HTML_UNKNOWN; if (!tag) return HTML_UNKNOWN; if (!strcmp(tag->name, "br")) { parser->space = FALSE; html_append_char(parser, '\n'); parser->state = HTML_BR; } else if (!strcmp(tag->name, "a")) { if (tag->attr && tag->attr->data && !strcmp(((HTMLAttr *)tag->attr->data)->name, "href")) { g_free(parser->href); parser->href = g_strdup(((HTMLAttr *)tag->attr->data)->value); parser->state = HTML_HREF; } } else if (!strcmp(tag->name, "/a")) { g_free(parser->href); parser->href = NULL; parser->state = HTML_NORMAL; } else if (!strcmp(tag->name, "p")) { parser->space = FALSE; if (!parser->empty_line) { parser->space = FALSE; if (!parser->newline) html_append_char(parser, '\n'); html_append_char(parser, '\n'); } parser->state = HTML_PAR; } else if (!strcmp(tag->name, "pre")) { parser->pre = TRUE; parser->state = HTML_PRE; } else if (!strcmp(tag->name, "/pre")) { parser->pre = FALSE; parser->state = HTML_NORMAL; } else if (!strcmp(tag->name, "hr")) { if (!parser->newline) { parser->space = FALSE; html_append_char(parser, '\n'); } html_append_str(parser, HR_STR "\n", -1); parser->state = HTML_HR; } else if (!strcmp(tag->name, "div") || !strcmp(tag->name, "ul") || !strcmp(tag->name, "li") || !strcmp(tag->name, "table") || !strcmp(tag->name, "tr") || (tag->name[0] == 'h' && g_ascii_isdigit(tag->name[1]))) { if (!parser->newline) { parser->space = FALSE; html_append_char(parser, '\n'); } parser->state = HTML_NORMAL; } else if (!strcmp(tag->name, "/table") || (tag->name[0] == '/' && tag->name[1] == 'h' && g_ascii_isdigit(tag->name[1]))) { if (!parser->empty_line) { parser->space = FALSE; if (!parser->newline) html_append_char(parser, '\n'); html_append_char(parser, '\n'); } parser->state = HTML_NORMAL; } else if (!strcmp(tag->name, "/div") || !strcmp(tag->name, "/ul") || !strcmp(tag->name, "/li")) { if (!parser->newline) { parser->space = FALSE; html_append_char(parser, '\n'); } parser->state = HTML_NORMAL; } html_free_tag(tag); return parser->state; } static void html_parse_special(HTMLParser *parser) { gchar symbol_name[9]; gint n; const gchar *val; parser->state = HTML_UNKNOWN; g_return_if_fail(*parser->bufp == '&'); /* &foo; */ for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++) ; if (n > 7 || parser->bufp[n] != ';') { /* output literal `&' */ html_append_char(parser, *parser->bufp++); parser->state = HTML_NORMAL; return; } strncpy2(symbol_name, parser->bufp, n + 2); parser->bufp += n + 1; if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name)) != NULL) { html_append_str(parser, val, -1); parser->state = HTML_NORMAL; return; } else if (symbol_name[1] == '#' && g_ascii_isdigit(symbol_name[2])) { gint ch; /* TODO: support other entity references */ ch = atoi(symbol_name + 2); if (g_ascii_isprint(ch)) { html_append_char(parser, ch); parser->state = HTML_NORMAL; return; } } html_append_str(parser, symbol_name, -1); } static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len) { gchar *p; buf[0] = '\0'; g_return_if_fail(*parser->bufp == '<'); /* ignore comment / CSS / script stuff */ if (!strncmp(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 3; return; } if (!g_strncasecmp(parser->bufp, "bufp += 6; while ((p = strcasestr(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 8; return; } if (!g_strncasecmp(parser->bufp, "bufp += 7; while ((p = strcasestr(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 9; return; } parser->bufp++; while ((p = strchr(parser->bufp, '>')) == NULL) if (html_read_line(parser) == HTML_EOF) return; strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len)); g_strstrip(buf); parser->bufp = p + 1; }