/* * Sylpheed -- a GTK+ based, lightweight, and fast e-mail client * Copyright (C) 1999,2000 Hiroyuki Yamamoto * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include #include #include #include #include "html.h" #include "codeconv.h" #include "utils.h" #define HTMLBUFSIZE 8192 #define HR_STR "------------------------------------------------" typedef struct _HTMLSymbol HTMLSymbol; struct _HTMLSymbol { gchar *const key; gchar *const val; }; static HTMLSymbol symbol_list[] = { {"<" , "<"}, {">" , ">"}, {"&" , "&"}, {""" , "\""}, {" " , " "}, {"™" , "(TM)"}, {"™", "(TM)"}, }; static HTMLSymbol ascii_symbol_list[] = { {"¡" , "^!"}, {"¦", "|"}, {"©" , "(C)"}, {"«" , "<<"}, {"®" , "(R)"}, {"²" , "^2"}, {"³" , "^3"}, {"´" , "'"}, {"¸" , ","}, {"¹" , "^1"}, {"»" , ">>"}, {"¼", "1/4"}, {"½", "1/2"}, {"¾", "3/4"}, {"¿", "^?"}, {"À", "A`"}, {"Á", "A'"}, {"Â" , "A^"}, {"Ã", "A~"}, {"Æ" , "AE"}, {"È", "E`"}, {"É", "E'"}, {"Ê" , "E^"}, {"Ì", "I`"}, {"Í", "I'"}, {"Î" , "I^"}, {"Ñ", "N~"}, {"Ò", "O`"}, {"Ó", "O'"}, {"Ô" , "O^"}, {"Õ", "O~"}, {"Ù", "U`"}, {"Ú", "U'"}, {"Û" , "U^"}, {"Ý", "Y'"}, {"à", "a`"}, {"á", "a'"}, {"â" , "a^"}, {"ã", "a~"}, {"æ" , "ae"}, {"è", "e`"}, {"é", "e'"}, {"ê" , "e^"}, {"ì", "i`"}, {"í", "i'"}, {"î" , "i^"}, {"ñ", "n~"}, {"ò", "o`"}, {"ó", "o'"}, {"ô" , "o^"}, {"õ", "o~"}, {"ù", "u`"}, {"ú", "u'"}, {"û" , "u^"}, {"ý", "y'"}, }; static HTMLSymbol eucjp_symbol_list[] = { {"¡" , "^!"}, {"¢" , "\xa1\xf1"}, {"£" , "\xa1\xf2"}, {"¥" , "\xa1\xef"}, {"¦", "|"}, {"§" , "\xa1\xf8"}, {"¨" , "\xa1\xaf"}, {"©" , "(C)"}, {"«" , "<<"}, {"®" , "(R)"}, {"°" , "\xa1\xeb"}, {"±", "\xa1\xde"}, {"²" , "^2"}, {"³" , "^3"}, {"´" , "'"}, {"µ" , "\xa6\xcc"}, {"¶" , "\xa2\xf9"}, {"·", "\xa1\xa6"}, {"¸" , ","}, {"¹" , "^1"}, {"»" , ">>"}, {"¼", "1/4"}, {"½", "1/2"}, {"¾", "3/4"}, {"¿", "^?"}, {"À", "A`"}, {"Á", "A'"}, {"Â" , "A^"}, {"Ã", "A~"}, {"Ä" , "A\xa1\xaf"}, {"Å" , "A\xa1\xeb"}, {"Æ" , "AE"}, {"È", "E`"}, {"É", "E'"}, {"Ê" , "E^"}, {"Ë" , "E\xa1\xaf"}, {"Ì", "I`"}, {"Í", "I'"}, {"Î" , "I^"}, {"Ï" , "I\xa1\xaf"}, {"Ñ", "N~"}, {"Ò", "O`"}, {"Ó", "O'"}, {"Ô" , "O^"}, {"Õ", "O~"}, {"Ö" , "O\xa1\xaf"}, {"×" , "\xa1\xdf"}, {"Ù", "U`"}, {"Ú", "U'"}, {"Û" , "U^"}, {"Ü" , "U\xa1\xaf"}, {"Ý", "Y'"}, {"à", "a`"}, {"á", "a'"}, {"â" , "a^"}, {"ã", "a~"}, {"ä" , "a\xa1\xaf"}, {"å" , "a\xa1\xeb"}, {"æ" , "ae"}, {"è", "e`"}, {"é", "e'"}, {"ê" , "e^"}, {"ë" , "e\xa1\xaf"}, {"ì", "i`"}, {"í", "i'"}, {"î" , "i^"}, {"ï" , "i\xa1\xaf"}, {"ð" , "\xa2\xdf"}, {"ñ", "n~"}, {"ò", "o`"}, {"ó", "o'"}, {"ô" , "o^"}, {"õ", "o~"}, {"ö" , "o\xa1\xaf"}, {"÷", "\xa1\xe0"}, {"ù", "u`"}, {"ú", "u'"}, {"û" , "u^"}, {"ü" , "u\xa1\xaf"}, {"ý", "y'"}, {"ÿ" , "y\xa1\xaf"}, }; static HTMLSymbol latin_symbol_list[] = { {"¡" , "\xa1"}, {"¢" , "\xa2"}, {"£" , "\xa3"}, {"¤", "\xa4"}, {"¥" , "\xa5"}, {"¦", "\xa6"}, {"§" , "\xa7"}, {"¨" , "\xa8"}, {"©" , "\xa9"}, {"ª" , "\xaa"}, {"«" , "\xab"}, {"¬" , "\xac"}, {"­" , "\xad"}, {"®" , "\xae"}, {"¯" , "\xaf"}, {"°" , "\xb0"}, {"±", "\xb1"}, {"²" , "\xb2"}, {"³" , "\xb3"}, {"´" , "\xb4"}, {"µ" , "\xb5"}, {"¶" , "\xb6"}, {"·", "\xb7"}, {"¸" , "\xb8"}, {"¹" , "\xb9"}, {"º" , "\xba"}, {"»" , "\xbb"}, {"¼", "\xbc"}, {"½", "\xbd"}, {"¾", "\xbe"}, {"¿", "\xbf"}, {"À", "\xc0"}, {"Á", "\xc1"}, {"Â" , "\xc2"}, {"Ã", "\xc3"}, {"Ä" , "\xc4"}, {"Å" , "\xc5"}, {"Æ" , "\xc6"}, {"Ç", "\xc7"}, {"È", "\xc8"}, {"É", "\xc9"}, {"Ê" , "\xca"}, {"Ë" , "\xcb"}, {"Ì", "\xcc"}, {"Í", "\xcd"}, {"Î" , "\xce"}, {"Ï" , "\xcf"}, {"Ð" , "\xd0"}, {"Ñ", "\xd1"}, {"Ò", "\xd2"}, {"Ó", "\xd3"}, {"Ô" , "\xd4"}, {"Õ", "\xd5"}, {"Ö" , "\xd6"}, {"×" , "\xd7"}, {"Ø", "\xd8"}, {"Ù", "\xd9"}, {"Ú", "\xda"}, {"Û" , "\xdb"}, {"Ü" , "\xdc"}, {"Ý", "\xdd"}, {"Þ" , "\xde"}, {"ß" , "\xdf"}, {"à", "\xe0"}, {"á", "\xe1"}, {"â" , "\xe2"}, {"ã", "\xe3"}, {"ä" , "\xe4"}, {"å" , "\xe5"}, {"æ" , "\xe6"}, {"ç", "\xe7"}, {"è", "\xe8"}, {"é", "\xe9"}, {"ê" , "\xea"}, {"ë" , "\xeb"}, {"ì", "\xec"}, {"í", "\xed"}, {"î" , "\xee"}, {"ï" , "\xef"}, {"ð" , "\xf0"}, {"ñ", "\xf1"}, {"ò", "\xf2"}, {"ó", "\xf3"}, {"ô" , "\xf4"}, {"õ", "\xf5"}, {"ö" , "\xf6"}, {"÷", "\xf7"}, {"ø", "\xf8"}, {"ù", "\xf9"}, {"ú", "\xfa"}, {"û" , "\xfb"}, {"ü" , "\xfc"}, {"ý", "\xfd"}, {"þ" , "\xfe"}, {"ÿ" , "\xff"}, }; static GHashTable *default_symbol_table; static GHashTable *eucjp_symbol_table; static GHashTable *latin_symbol_table; static HTMLState html_read_line (HTMLParser *parser); static void html_append_char (HTMLParser *parser, gchar ch); static void html_append_str (HTMLParser *parser, const gchar *str, gint len); static HTMLState html_parse_tag (HTMLParser *parser); static void html_parse_special (HTMLParser *parser); static void html_get_parenthesis (HTMLParser *parser, gchar *buf, gint len); #if 0 static gint g_str_case_equal (gconstpointer v, gconstpointer v2); static guint g_str_case_hash (gconstpointer key); #endif HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv) { HTMLParser *parser; g_return_val_if_fail(fp != NULL, NULL); g_return_val_if_fail(conv != NULL, NULL); parser = g_new0(HTMLParser, 1); parser->fp = fp; parser->conv = conv; parser->str = g_string_new(NULL); parser->buf = g_string_new(NULL); parser->bufp = parser->buf->str; parser->newline = TRUE; parser->empty_line = TRUE; parser->space = FALSE; parser->pre = FALSE; #define SYMBOL_TABLE_ADD(table, list) \ { \ gint i; \ \ for (i = 0; i < sizeof(list) / sizeof(list[0]); i++) \ g_hash_table_insert(table, list[i].key, list[i].val); \ } if (!default_symbol_table) { default_symbol_table = g_hash_table_new(g_str_hash, g_str_equal); SYMBOL_TABLE_ADD(default_symbol_table, symbol_list); SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list); } if (!eucjp_symbol_table) { eucjp_symbol_table = g_hash_table_new(g_str_hash, g_str_equal); SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list); SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list); } if (!latin_symbol_table) { latin_symbol_table = g_hash_table_new(g_str_hash, g_str_equal); SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list); SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list); } #undef SYMBOL_TABLE_ADD if (conv->charset == C_ISO_8859_1) parser->symbol_table = latin_symbol_table; else if ((conv->charset == C_ISO_2022_JP || conv->charset == C_ISO_2022_JP_2 || conv->charset == C_EUC_JP || conv->charset == C_SHIFT_JIS) && conv_get_current_charset() == C_EUC_JP) parser->symbol_table = eucjp_symbol_table; else parser->symbol_table = default_symbol_table; return parser; } void html_parser_destroy(HTMLParser *parser) { g_string_free(parser->str, TRUE); g_string_free(parser->buf, TRUE); g_free(parser); } gchar *html_parse(HTMLParser *parser) { parser->state = HTML_NORMAL; g_string_truncate(parser->str, 0); if (*parser->bufp == '\0') { g_string_truncate(parser->buf, 0); parser->bufp = parser->buf->str; if (html_read_line(parser) == HTML_EOF) return NULL; } while (*parser->bufp != '\0') { switch (*parser->bufp) { case '<': if (parser->str->len == 0) html_parse_tag(parser); else return parser->str->str; break; case '&': html_parse_special(parser); break; case ' ': case '\t': case '\r': case '\n': if (parser->bufp[0] == '\r' && parser->bufp[1] == '\n') parser->bufp++; if (!parser->pre) { if (!parser->newline) parser->space = TRUE; parser->bufp++; break; } /* fallthrough */ default: html_append_char(parser, *parser->bufp++); } } return parser->str->str; } static HTMLState html_read_line(HTMLParser *parser) { gchar buf[HTMLBUFSIZE]; gchar buf2[HTMLBUFSIZE]; gint index; if (fgets(buf, sizeof(buf), parser->fp) == NULL) { parser->state = HTML_EOF; return HTML_EOF; } if (conv_convert(parser->conv, buf2, sizeof(buf2), buf) < 0) { g_warning("html_read_line(): code conversion failed\n"); index = parser->bufp - parser->buf->str; g_string_append(parser->buf, buf); parser->bufp = parser->buf->str + index; return HTML_ERR; } index = parser->bufp - parser->buf->str; g_string_append(parser->buf, buf2); parser->bufp = parser->buf->str + index; return HTML_NORMAL; } static void html_append_char(HTMLParser *parser, gchar ch) { GString *str = parser->str; if (!parser->pre && parser->space) { g_string_append_c(str, ' '); parser->space = FALSE; } g_string_append_c(str, ch); parser->empty_line = FALSE; if (ch == '\n') { parser->newline = TRUE; if (str->len > 1 && str->str[str->len - 2] == '\n') parser->empty_line = TRUE; } else parser->newline = FALSE; } static void html_append_str(HTMLParser *parser, const gchar *str, gint len) { GString *string = parser->str; if (!parser->pre && parser->space) { g_string_append_c(string, ' '); parser->space = FALSE; } if (len == 0) return; if (len < 0) g_string_append(string, str); else { gchar *s; Xstrndup_a(s, str, len, return); g_string_append(string, s); } parser->empty_line = FALSE; if (string->len > 0 && string->str[string->len - 1] == '\n') { parser->newline = TRUE; if (string->len > 1 && string->str[string->len - 2] == '\n') parser->empty_line = TRUE; } else parser->newline = FALSE; } static HTMLState html_parse_tag(HTMLParser *parser) { gchar buf[HTMLBUFSIZE]; gchar *p; static gboolean is_in_href = FALSE; html_get_parenthesis(parser, buf, sizeof(buf)); for (p = buf; *p != '\0'; p++) { if (isspace(*p)) { *p = '\0'; break; } } parser->state = HTML_UNKNOWN; if (buf[0] == '\0') return parser->state; g_strdown(buf); if (!strcmp(buf, "br")) { parser->space = FALSE; html_append_char(parser, '\n'); parser->state = HTML_BR; } else if (!strcmp(buf, "a")) { /* look for tokens separated by space or = */ char* href_token = strtok(++p, " ="); parser->state = HTML_NORMAL; while (href_token != NULL) { /* look for href */ if (!strcmp(href_token, "href")) { /* the next token is the url, between double * quotes */ char* url = strtok(NULL, "\""); html_append_str(parser, url, strlen(url)); html_append_char(parser, ' '); /* start enforcing html link */ parser->state = HTML_HREF; is_in_href = TRUE; break; } /* or get next token */ href_token = strtok(NULL, " ="); } } else if (!strcmp(buf, "/a")) { /* stop enforcing html link */ parser->state = HTML_NORMAL; is_in_href = FALSE; } else if (!strcmp(buf, "p")) { parser->space = FALSE; if (!parser->empty_line) { parser->space = FALSE; if (!parser->newline) html_append_char(parser, '\n'); html_append_char(parser, '\n'); } parser->state = HTML_PAR; } else if (!strcmp(buf, "pre")) { parser->pre = TRUE; parser->state = HTML_PRE; } else if (!strcmp(buf, "/pre")) { parser->pre = FALSE; parser->state = HTML_NORMAL; } else if (!strcmp(buf, "hr")) { if (!parser->newline) { parser->space = FALSE; html_append_char(parser, '\n'); } html_append_str(parser, HR_STR "\n", -1); parser->state = HTML_HR; } else if (!strcmp(buf, "div") || !strcmp(buf, "ul") || !strcmp(buf, "li") || !strcmp(buf, "table") || !strcmp(buf, "tr") || (buf[0] == 'h' && isdigit(buf[1]))) { if (!parser->newline) { parser->space = FALSE; html_append_char(parser, '\n'); } parser->state = HTML_NORMAL; } else if (!strcmp(buf, "/table") || (buf[0] == '/' && buf[1] == 'h' && isdigit(buf[1]))) { if (!parser->empty_line) { parser->space = FALSE; if (!parser->newline) html_append_char(parser, '\n'); html_append_char(parser, '\n'); } parser->state = HTML_NORMAL; } else if (!strcmp(buf, "/div") || !strcmp(buf, "/ul") || !strcmp(buf, "/li")) { if (!parser->newline) { parser->space = FALSE; html_append_char(parser, '\n'); } parser->state = HTML_NORMAL; } if (is_in_href == TRUE) { /* when inside a link, everything will be written as * clickable (see textview_show_thml in textview.c) */ parser->state = HTML_HREF; } return parser->state; } static void html_parse_special(HTMLParser *parser) { gchar symbol_name[9]; gint n; const gchar *val; parser->state = HTML_UNKNOWN; g_return_if_fail(*parser->bufp == '&'); /* &foo; */ for (n = 0; parser->bufp[n] != '\0' && parser->bufp[n] != ';'; n++) ; if (n > 7 || parser->bufp[n] != ';') { /* output literal `&' */ html_append_char(parser, *parser->bufp++); parser->state = HTML_NORMAL; return; } strncpy2(symbol_name, parser->bufp, n + 2); parser->bufp += n + 1; if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name)) != NULL) { html_append_str(parser, val, -1); parser->state = HTML_NORMAL; return; } else if (symbol_name[1] == '#' && isdigit(symbol_name[2])) { gint ch; ch = atoi(symbol_name + 2); if ((ch > 0 && ch <= 127) || (ch >= 128 && ch <= 255 && parser->conv->charset == C_ISO_8859_1)) { html_append_char(parser, ch); parser->state = HTML_NORMAL; return; } } html_append_str(parser, symbol_name, -1); } static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len) { gchar *p; buf[0] = '\0'; g_return_if_fail(*parser->bufp == '<'); /* ignore comments */ if (!strncmp(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 3; return; } /* because html is not strict regarding case and double-quoting of tags we have to check for both */ /* ignore css stuff */ if (!g_strncasecmp(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 8; return; } /* ignore css stuff with double quotes*/ if (!g_strncasecmp(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 8; return; } /* ignore javascipt stuff */ if (!g_strncasecmp(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 8; return; } /* ignore javascipt stuff with double-quotes */ if (!g_strncasecmp(parser->bufp, "")) == NULL) if (html_read_line(parser) == HTML_EOF) return; parser->bufp = p + 8; return; } parser->bufp++; while ((p = strchr(parser->bufp, '>')) == NULL) if (html_read_line(parser) == HTML_EOF) return; strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len)); parser->bufp = p + 1; } /* these hash functions were taken from gstring.c in glib */ #if 0 static gint g_str_case_equal(gconstpointer v, gconstpointer v2) { return strcasecmp((const gchar *)v, (const gchar *)v2) == 0; } static guint g_str_case_hash(gconstpointer key) { const gchar *p = key; guint h = *p; if (h) { h = tolower(h); for (p += 1; *p != '\0'; p++) h = (h << 5) - h + tolower(*p); } return h; } #endif