728 lines
17 KiB
C
728 lines
17 KiB
C
/*
|
|
* Sylpheed -- a GTK+ based, lightweight, and fast e-mail client
|
|
* Copyright (C) 1999,2000 Hiroyuki Yamamoto
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; either version 2 of the License, or
|
|
* (at your option) any later version.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, write to the Free Software
|
|
* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
|
|
*/
|
|
|
|
#include <glib.h>
|
|
#include <stdio.h>
|
|
#include <string.h>
|
|
#include <ctype.h>
|
|
|
|
#include "html.h"
|
|
#include "codeconv.h"
|
|
#include "utils.h"
|
|
|
|
#define HTMLBUFSIZE 8192
|
|
#define HR_STR "------------------------------------------------"
|
|
|
|
typedef struct _HTMLSymbol HTMLSymbol;
|
|
|
|
struct _HTMLSymbol
|
|
{
|
|
gchar *const key;
|
|
gchar *const val;
|
|
};
|
|
|
|
static HTMLSymbol symbol_list[] = {
|
|
{"<" , "<"},
|
|
{">" , ">"},
|
|
{"&" , "&"},
|
|
{""" , "\""},
|
|
{" " , " "},
|
|
{"™" , "(TM)"},
|
|
|
|
{"™", "(TM)"},
|
|
};
|
|
|
|
static HTMLSymbol ascii_symbol_list[] = {
|
|
{"¡" , "^!"},
|
|
{"¦", "|"},
|
|
{"©" , "(C)"},
|
|
{"«" , "<<"},
|
|
{"®" , "(R)"},
|
|
|
|
{"²" , "^2"},
|
|
{"³" , "^3"},
|
|
{"´" , "'"},
|
|
{"¸" , ","},
|
|
{"¹" , "^1"},
|
|
{"»" , ">>"},
|
|
{"¼", "1/4"},
|
|
{"½", "1/2"},
|
|
{"¾", "3/4"},
|
|
{"¿", "^?"},
|
|
|
|
{"À", "A`"},
|
|
{"Á", "A'"},
|
|
{"Â" , "A^"},
|
|
{"Ã", "A~"},
|
|
{"Æ" , "AE"},
|
|
{"È", "E`"},
|
|
{"É", "E'"},
|
|
{"Ê" , "E^"},
|
|
{"Ì", "I`"},
|
|
{"Í", "I'"},
|
|
{"Î" , "I^"},
|
|
|
|
{"Ñ", "N~"},
|
|
{"Ò", "O`"},
|
|
{"Ó", "O'"},
|
|
{"Ô" , "O^"},
|
|
{"Õ", "O~"},
|
|
{"Ù", "U`"},
|
|
{"Ú", "U'"},
|
|
{"Û" , "U^"},
|
|
{"Ý", "Y'"},
|
|
|
|
{"à", "a`"},
|
|
{"á", "a'"},
|
|
{"â" , "a^"},
|
|
{"ã", "a~"},
|
|
{"æ" , "ae"},
|
|
{"è", "e`"},
|
|
{"é", "e'"},
|
|
{"ê" , "e^"},
|
|
{"ì", "i`"},
|
|
{"í", "i'"},
|
|
{"î" , "i^"},
|
|
|
|
{"ñ", "n~"},
|
|
{"ò", "o`"},
|
|
{"ó", "o'"},
|
|
{"ô" , "o^"},
|
|
{"õ", "o~"},
|
|
{"ù", "u`"},
|
|
{"ú", "u'"},
|
|
{"û" , "u^"},
|
|
{"ý", "y'"},
|
|
};
|
|
|
|
static HTMLSymbol eucjp_symbol_list[] = {
|
|
{"¡" , "^!"},
|
|
{"¢" , "\xa1\xf1"},
|
|
{"£" , "\xa1\xf2"},
|
|
{"¥" , "\xa1\xef"},
|
|
{"¦", "|"},
|
|
{"§" , "\xa1\xf8"},
|
|
{"¨" , "\xa1\xaf"},
|
|
{"©" , "(C)"},
|
|
{"«" , "<<"},
|
|
{"®" , "(R)"},
|
|
|
|
{"°" , "\xa1\xeb"},
|
|
{"±", "\xa1\xde"},
|
|
{"²" , "^2"},
|
|
{"³" , "^3"},
|
|
{"´" , "'"},
|
|
{"µ" , "\xa6\xcc"},
|
|
{"¶" , "\xa2\xf9"},
|
|
{"·", "\xa1\xa6"},
|
|
{"¸" , ","},
|
|
{"¹" , "^1"},
|
|
{"»" , ">>"},
|
|
{"¼", "1/4"},
|
|
{"½", "1/2"},
|
|
{"¾", "3/4"},
|
|
{"¿", "^?"},
|
|
|
|
{"À", "A`"},
|
|
{"Á", "A'"},
|
|
{"Â" , "A^"},
|
|
{"Ã", "A~"},
|
|
{"Ä" , "A\xa1\xaf"},
|
|
{"Å" , "A\xa1\xeb"},
|
|
{"Æ" , "AE"},
|
|
{"È", "E`"},
|
|
{"É", "E'"},
|
|
{"Ê" , "E^"},
|
|
{"Ë" , "E\xa1\xaf"},
|
|
{"Ì", "I`"},
|
|
{"Í", "I'"},
|
|
{"Î" , "I^"},
|
|
{"Ï" , "I\xa1\xaf"},
|
|
|
|
{"Ñ", "N~"},
|
|
{"Ò", "O`"},
|
|
{"Ó", "O'"},
|
|
{"Ô" , "O^"},
|
|
{"Õ", "O~"},
|
|
{"Ö" , "O\xa1\xaf"},
|
|
{"×" , "\xa1\xdf"},
|
|
{"Ù", "U`"},
|
|
{"Ú", "U'"},
|
|
{"Û" , "U^"},
|
|
{"Ü" , "U\xa1\xaf"},
|
|
{"Ý", "Y'"},
|
|
|
|
{"à", "a`"},
|
|
{"á", "a'"},
|
|
{"â" , "a^"},
|
|
{"ã", "a~"},
|
|
{"ä" , "a\xa1\xaf"},
|
|
{"å" , "a\xa1\xeb"},
|
|
{"æ" , "ae"},
|
|
{"è", "e`"},
|
|
{"é", "e'"},
|
|
{"ê" , "e^"},
|
|
{"ë" , "e\xa1\xaf"},
|
|
{"ì", "i`"},
|
|
{"í", "i'"},
|
|
{"î" , "i^"},
|
|
{"ï" , "i\xa1\xaf"},
|
|
|
|
{"ð" , "\xa2\xdf"},
|
|
{"ñ", "n~"},
|
|
{"ò", "o`"},
|
|
{"ó", "o'"},
|
|
{"ô" , "o^"},
|
|
{"õ", "o~"},
|
|
{"ö" , "o\xa1\xaf"},
|
|
{"÷", "\xa1\xe0"},
|
|
{"ù", "u`"},
|
|
{"ú", "u'"},
|
|
{"û" , "u^"},
|
|
{"ü" , "u\xa1\xaf"},
|
|
{"ý", "y'"},
|
|
{"ÿ" , "y\xa1\xaf"},
|
|
};
|
|
|
|
static HTMLSymbol latin_symbol_list[] = {
|
|
{"¡" , "\xa1"},
|
|
{"¢" , "\xa2"},
|
|
{"£" , "\xa3"},
|
|
{"¤", "\xa4"},
|
|
{"¥" , "\xa5"},
|
|
{"¦", "\xa6"},
|
|
{"§" , "\xa7"},
|
|
{"¨" , "\xa8"},
|
|
{"©" , "\xa9"},
|
|
{"ª" , "\xaa"},
|
|
{"«" , "\xab"},
|
|
{"¬" , "\xac"},
|
|
{"­" , "\xad"},
|
|
{"®" , "\xae"},
|
|
{"¯" , "\xaf"},
|
|
|
|
{"°" , "\xb0"},
|
|
{"±", "\xb1"},
|
|
{"²" , "\xb2"},
|
|
{"³" , "\xb3"},
|
|
{"´" , "\xb4"},
|
|
{"µ" , "\xb5"},
|
|
{"¶" , "\xb6"},
|
|
{"·", "\xb7"},
|
|
{"¸" , "\xb8"},
|
|
{"¹" , "\xb9"},
|
|
{"º" , "\xba"},
|
|
{"»" , "\xbb"},
|
|
{"¼", "\xbc"},
|
|
{"½", "\xbd"},
|
|
{"¾", "\xbe"},
|
|
{"¿", "\xbf"},
|
|
|
|
{"À", "\xc0"},
|
|
{"Á", "\xc1"},
|
|
{"Â" , "\xc2"},
|
|
{"Ã", "\xc3"},
|
|
{"Ä" , "\xc4"},
|
|
{"Å" , "\xc5"},
|
|
{"Æ" , "\xc6"},
|
|
{"Ç", "\xc7"},
|
|
{"È", "\xc8"},
|
|
{"É", "\xc9"},
|
|
{"Ê" , "\xca"},
|
|
{"Ë" , "\xcb"},
|
|
{"Ì", "\xcc"},
|
|
{"Í", "\xcd"},
|
|
{"Î" , "\xce"},
|
|
{"Ï" , "\xcf"},
|
|
|
|
{"Ð" , "\xd0"},
|
|
{"Ñ", "\xd1"},
|
|
{"Ò", "\xd2"},
|
|
{"Ó", "\xd3"},
|
|
{"Ô" , "\xd4"},
|
|
{"Õ", "\xd5"},
|
|
{"Ö" , "\xd6"},
|
|
{"×" , "\xd7"},
|
|
{"Ø", "\xd8"},
|
|
{"Ù", "\xd9"},
|
|
{"Ú", "\xda"},
|
|
{"Û" , "\xdb"},
|
|
{"Ü" , "\xdc"},
|
|
{"Ý", "\xdd"},
|
|
{"Þ" , "\xde"},
|
|
{"ß" , "\xdf"},
|
|
|
|
{"à", "\xe0"},
|
|
{"á", "\xe1"},
|
|
{"â" , "\xe2"},
|
|
{"ã", "\xe3"},
|
|
{"ä" , "\xe4"},
|
|
{"å" , "\xe5"},
|
|
{"æ" , "\xe6"},
|
|
{"ç", "\xe7"},
|
|
{"è", "\xe8"},
|
|
{"é", "\xe9"},
|
|
{"ê" , "\xea"},
|
|
{"ë" , "\xeb"},
|
|
{"ì", "\xec"},
|
|
{"í", "\xed"},
|
|
{"î" , "\xee"},
|
|
{"ï" , "\xef"},
|
|
|
|
{"ð" , "\xf0"},
|
|
{"ñ", "\xf1"},
|
|
{"ò", "\xf2"},
|
|
{"ó", "\xf3"},
|
|
{"ô" , "\xf4"},
|
|
{"õ", "\xf5"},
|
|
{"ö" , "\xf6"},
|
|
{"÷", "\xf7"},
|
|
{"ø", "\xf8"},
|
|
{"ù", "\xf9"},
|
|
{"ú", "\xfa"},
|
|
{"û" , "\xfb"},
|
|
{"ü" , "\xfc"},
|
|
{"ý", "\xfd"},
|
|
{"þ" , "\xfe"},
|
|
{"ÿ" , "\xff"},
|
|
};
|
|
|
|
static GHashTable *default_symbol_table;
|
|
static GHashTable *eucjp_symbol_table;
|
|
static GHashTable *latin_symbol_table;
|
|
|
|
static HTMLState html_read_line (HTMLParser *parser);
|
|
static void html_append_char (HTMLParser *parser,
|
|
gchar ch);
|
|
static void html_append_str (HTMLParser *parser,
|
|
const gchar *str,
|
|
gint len);
|
|
static HTMLState html_parse_tag (HTMLParser *parser);
|
|
static void html_parse_special (HTMLParser *parser);
|
|
static void html_get_parenthesis (HTMLParser *parser,
|
|
gchar *buf,
|
|
gint len);
|
|
|
|
#if 0
|
|
static gint g_str_case_equal (gconstpointer v,
|
|
gconstpointer v2);
|
|
static guint g_str_case_hash (gconstpointer key);
|
|
#endif
|
|
|
|
/*
 * Create a parser that reads HTML from `fp' and converts each line with
 * `conv'.  Neither argument may be NULL; NULL is returned otherwise.
 *
 * The shared entity tables are built on the first call.  The table used by
 * the new parser depends on the converter's charset (and, for the Japanese
 * charsets, on conv_get_current_charset()).
 *
 * Release the result with html_parser_destroy().
 */
HTMLParser *html_parser_new(FILE *fp, CodeConverter *conv)
{
	HTMLParser *self;

	g_return_val_if_fail(fp != NULL, NULL);
	g_return_val_if_fail(conv != NULL, NULL);

	self = g_new0(HTMLParser, 1);
	self->fp         = fp;
	self->conv       = conv;
	self->str        = g_string_new(NULL);
	self->buf        = g_string_new(NULL);
	self->bufp       = self->buf->str;
	self->newline    = TRUE;
	self->empty_line = TRUE;
	self->space      = FALSE;
	self->pre        = FALSE;

/* Insert every key/val pair of the array `list' into `table'. */
#define SYMBOL_TABLE_ADD(table, list)					\
	do {								\
		gint k;							\
									\
		for (k = 0; k < sizeof(list) / sizeof(list[0]); k++)	\
			g_hash_table_insert(table, list[k].key,		\
					    list[k].val);		\
	} while (0)

	if (default_symbol_table == NULL) {
		default_symbol_table =
			g_hash_table_new(g_str_hash, g_str_equal);
		SYMBOL_TABLE_ADD(default_symbol_table, symbol_list);
		SYMBOL_TABLE_ADD(default_symbol_table, ascii_symbol_list);
	}
	if (eucjp_symbol_table == NULL) {
		eucjp_symbol_table =
			g_hash_table_new(g_str_hash, g_str_equal);
		SYMBOL_TABLE_ADD(eucjp_symbol_table, symbol_list);
		SYMBOL_TABLE_ADD(eucjp_symbol_table, eucjp_symbol_list);
	}
	if (latin_symbol_table == NULL) {
		latin_symbol_table =
			g_hash_table_new(g_str_hash, g_str_equal);
		SYMBOL_TABLE_ADD(latin_symbol_table, symbol_list);
		SYMBOL_TABLE_ADD(latin_symbol_table, latin_symbol_list);
	}

#undef SYMBOL_TABLE_ADD

	/* ASCII approximations unless a better match is found below */
	self->symbol_table = default_symbol_table;

	switch (conv->charset) {
	case C_ISO_8859_1:
		self->symbol_table = latin_symbol_table;
		break;
	case C_ISO_2022_JP:
	case C_ISO_2022_JP_2:
	case C_EUC_JP:
	case C_SHIFT_JIS:
		/* the EUC-JP values are only used when the current
		 * charset is EUC-JP as well */
		if (conv_get_current_charset() == C_EUC_JP)
			self->symbol_table = eucjp_symbol_table;
		break;
	default:
		break;
	}

	return self;
}
|
|
|
|
/*
 * Free a parser created by html_parser_new(), including its two internal
 * string buffers.  The FILE and CodeConverter handed to html_parser_new()
 * are not touched here.  Passing NULL is a no-op.
 */
void html_parser_destroy(HTMLParser *parser)
{
	if (parser == NULL)
		return;

	g_string_free(parser->str, TRUE);
	g_string_free(parser->buf, TRUE);
	g_free(parser);
}
|
|
|
|
/*
 * Produce the next chunk of plain text.
 *
 * Consumes buffered input until either the buffer is exhausted or a tag is
 * met after some text has already been collected.  A tag at the very start
 * of a chunk is processed by html_parse_tag(), entities by
 * html_parse_special().  Outside <pre>, whitespace is not copied; it only
 * sets the pending-space flag (and only when not at the start of a line).
 *
 * Returns the parser's internal string (owned by the parser and overwritten
 * by the next call), or NULL when the input file is at EOF.
 */
gchar *html_parse(HTMLParser *parser)
{
	GString *out = parser->str;

	parser->state = HTML_NORMAL;
	g_string_truncate(out, 0);

	/* nothing left from the previous call: start over with a new line */
	if (parser->bufp[0] == '\0') {
		g_string_truncate(parser->buf, 0);
		parser->bufp = parser->buf->str;
		if (html_read_line(parser) == HTML_EOF)
			return NULL;
	}

	for (;;) {
		const gchar c = parser->bufp[0];

		if (c == '\0')
			break;

		if (c == '<') {
			/* hand back collected text before touching the tag */
			if (out->len != 0)
				return out->str;
			html_parse_tag(parser);
			continue;
		}

		if (c == '&') {
			html_parse_special(parser);
			continue;
		}

		if (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
			/* treat CR LF as a single character */
			if (c == '\r' && parser->bufp[1] == '\n')
				parser->bufp++;

			if (!parser->pre) {
				if (!parser->newline)
					parser->space = TRUE;
				parser->bufp++;
				continue;
			}
			/* inside <pre>: copy the whitespace verbatim below */
		}

		html_append_char(parser, *parser->bufp++);
	}

	return out->str;
}
|
|
|
|
/*
 * Read one line (at most HTMLBUFSIZE - 1 bytes) from the input file, run
 * it through the code converter and append it to the parse buffer.
 *
 * parser->bufp keeps its offset into the buffer even if appending moves
 * the underlying storage.
 *
 * Returns HTML_EOF (also stored in parser->state) when fgets() yields
 * nothing, HTML_ERR when conversion failed (the unconverted line is
 * appended instead), HTML_NORMAL otherwise.
 */
static HTMLState html_read_line(HTMLParser *parser)
{
	gchar raw[HTMLBUFSIZE];
	gchar converted[HTMLBUFSIZE];
	const gchar *line = converted;
	HTMLState result = HTML_NORMAL;
	gint offset;

	if (fgets(raw, sizeof(raw), parser->fp) == NULL) {
		parser->state = HTML_EOF;
		return HTML_EOF;
	}

	if (conv_convert(parser->conv, converted, sizeof(converted), raw) < 0) {
		g_warning("html_read_line(): code conversion failed\n");
		/* fall back to the unconverted text */
		line = raw;
		result = HTML_ERR;
	}

	/* remember the read position as an offset: the append may move
	 * the buffer's storage */
	offset = parser->bufp - parser->buf->str;
	g_string_append(parser->buf, line);
	parser->bufp = parser->buf->str + offset;

	return result;
}
|
|
|
|
/*
 * Append a single character to the output string, first emitting the
 * pending space (if any, and not inside <pre>).  Updates the `newline'
 * flag (last character is '\n') and the `empty_line' flag (last two
 * characters are both '\n').
 */
static void html_append_char(HTMLParser *parser, gchar ch)
{
	GString *out = parser->str;
	gboolean is_nl = (ch == '\n') ? TRUE : FALSE;

	if (parser->space && !parser->pre) {
		g_string_append_c(out, ' ');
		parser->space = FALSE;
	}

	g_string_append_c(out, ch);

	parser->newline = is_nl;
	if (is_nl && out->len > 1 && out->str[out->len - 2] == '\n')
		parser->empty_line = TRUE;
	else
		parser->empty_line = FALSE;
}
|
|
|
|
/*
 * Append a string to the output, first emitting the pending space (if any,
 * and not inside <pre>).
 *
 *   len <  0  append all of `str'
 *   len == 0  append nothing (the pending space is still flushed) and
 *             leave the newline flags untouched
 *   len >  0  append at most `len' characters of `str'
 *
 * Afterwards `newline' and `empty_line' reflect whether the output ends in
 * one or two '\n' characters.
 */
static void html_append_str(HTMLParser *parser, const gchar *str, gint len)
{
	GString *out = parser->str;
	gboolean ends_with_nl;

	if (parser->space && !parser->pre) {
		g_string_append_c(out, ' ');
		parser->space = FALSE;
	}

	if (len == 0)
		return;

	if (len > 0) {
		gchar *piece;

		/* temporary NUL-terminated copy of the first `len' chars */
		Xstrndup_a(piece, str, len, return);
		g_string_append(out, piece);
	} else {
		g_string_append(out, str);
	}

	ends_with_nl = (out->len > 0 && out->str[out->len - 1] == '\n');

	parser->newline = ends_with_nl ? TRUE : FALSE;
	if (ends_with_nl && out->len > 1 && out->str[out->len - 2] == '\n')
		parser->empty_line = TRUE;
	else
		parser->empty_line = FALSE;
}
|
|
|
|
/*
 * Consume the tag at parser->bufp and apply its effect on the output.
 *
 * Line-breaking tags (<br>, <p>, <hr>, <div>, <ul>, <li>, <table>, <tr>,
 * <hN> and some of their closing forms) append newlines as needed,
 * <pre>/</pre> toggle verbatim mode, and <a href=...> appends the URL
 * followed by a space.  All other tags are dropped.
 *
 * Returns the resulting state, which is also stored in parser->state:
 * HTML_UNKNOWN for an empty, skipped or unrecognised tag; HTML_HREF for
 * any recognised tag seen between <a href=...> and </a>.
 */
static HTMLState html_parse_tag(HTMLParser *parser)
{
	/* characters that separate attribute names/values inside <a ...> */
	static const gchar attr_delim[] = " \t\r\n=";
	/* NOTE: link tracking is function-static, hence shared by every
	 * parser instance rather than kept per HTMLParser. */
	static gboolean is_in_href = FALSE;
	gchar buf[HTMLBUFSIZE];
	gchar *p;
	gchar *attrs;

	html_get_parenthesis(parser, buf, sizeof(buf));

	/* split "name attributes..." at the first whitespace character */
	for (p = buf; *p != '\0'; p++) {
		if (isspace((guchar)*p))
			break;
	}
	if (*p != '\0') {
		*p = '\0';
		attrs = p + 1;
	} else {
		/* no attributes: point at the terminator (an empty string)
		 * instead of stepping past the end of the tag text */
		attrs = p;
	}

	parser->state = HTML_UNKNOWN;
	if (buf[0] == '\0') return parser->state;

	g_strdown(buf);

	if (!strcmp(buf, "br")) {
		parser->space = FALSE;
		html_append_char(parser, '\n');
		parser->state = HTML_BR;
	} else if (!strcmp(buf, "a")) {
		/* look for tokens separated by whitespace or = */
		gchar *href_token = strtok(attrs, attr_delim);

		parser->state = HTML_NORMAL;
		while (href_token != NULL) {
			/* attribute names are case-insensitive */
			if (!g_strcasecmp(href_token, "href")) {
				/* the next token is the url, between double
				 * quotes */
				gchar *url = strtok(NULL, "\"");

				/* href without a value: nothing to emit */
				if (url != NULL) {
					html_append_str(parser, url,
							strlen(url));
					html_append_char(parser, ' ');
					/* start enforcing html link */
					parser->state = HTML_HREF;
					is_in_href = TRUE;
				}
				break;
			}
			/* or get next token */
			href_token = strtok(NULL, attr_delim);
		}
	} else if (!strcmp(buf, "/a")) {
		/* stop enforcing html link */
		parser->state = HTML_NORMAL;
		is_in_href = FALSE;
	} else if (!strcmp(buf, "p")) {
		parser->space = FALSE;
		/* make sure the paragraph is preceded by a blank line */
		if (!parser->empty_line) {
			if (!parser->newline) html_append_char(parser, '\n');
			html_append_char(parser, '\n');
		}
		parser->state = HTML_PAR;
	} else if (!strcmp(buf, "pre")) {
		parser->pre = TRUE;
		parser->state = HTML_PRE;
	} else if (!strcmp(buf, "/pre")) {
		parser->pre = FALSE;
		parser->state = HTML_NORMAL;
	} else if (!strcmp(buf, "hr")) {
		if (!parser->newline) {
			parser->space = FALSE;
			html_append_char(parser, '\n');
		}
		html_append_str(parser, HR_STR "\n", -1);
		parser->state = HTML_HR;
	} else if (!strcmp(buf, "div")   ||
		   !strcmp(buf, "ul")    ||
		   !strcmp(buf, "li")    ||
		   !strcmp(buf, "table") ||
		   !strcmp(buf, "tr")    ||
		   (buf[0] == 'h' && isdigit((guchar)buf[1]))) {
		/* block start: begin on a fresh line */
		if (!parser->newline) {
			parser->space = FALSE;
			html_append_char(parser, '\n');
		}
		parser->state = HTML_NORMAL;
	} else if (!strcmp(buf, "/table") ||
		   (buf[0] == '/' && buf[1] == 'h' &&
		    isdigit((guchar)buf[2]))) {
		/* end of table or heading: follow with a blank line.
		 * The digit of "/hN" is at index 2. */
		if (!parser->empty_line) {
			parser->space = FALSE;
			if (!parser->newline) html_append_char(parser, '\n');
			html_append_char(parser, '\n');
		}
		parser->state = HTML_NORMAL;
	} else if (!strcmp(buf, "/div") ||
		   !strcmp(buf, "/ul")  ||
		   !strcmp(buf, "/li")) {
		if (!parser->newline) {
			parser->space = FALSE;
			html_append_char(parser, '\n');
		}
		parser->state = HTML_NORMAL;
	}

	if (is_in_href == TRUE) {
		/* when inside a link, everything will be written as
		 * clickable (see the HTML display code in textview.c) */
		parser->state = HTML_HREF;
	}

	return parser->state;
}
|
|
|
|
/*
 * Handle the '&' at parser->bufp.
 *
 * If it starts an entity of at most 8 characters including '&' and ';',
 * the entity is consumed and replaced by
 *   - its value from the parser's symbol table, or
 *   - for decimal references "&#N;", the character N when N is 1..127, or
 *     128..255 with an ISO-8859-1 converter, or
 *   - the entity text itself when neither applies (state stays
 *     HTML_UNKNOWN in that case).
 * Otherwise only the '&' is consumed and copied literally.
 */
static void html_parse_special(HTMLParser *parser)
{
	gchar symbol_name[9];
	gint n;
	const gchar *val;

	parser->state = HTML_UNKNOWN;
	g_return_if_fail(*parser->bufp == '&');

	/* &foo; -- find the ';'.  The scan stops after 8 characters since
	 * anything longer is rejected below anyway. */
	for (n = 0;
	     n <= 7 && parser->bufp[n] != '\0' && parser->bufp[n] != ';';
	     n++)
		;
	if (n > 7 || parser->bufp[n] != ';') {
		/* output literal `&' */
		html_append_char(parser, *parser->bufp++);
		parser->state = HTML_NORMAL;
		return;
	}

	/* copy "&foo;" (n + 1 characters) plus the terminator */
	strncpy2(symbol_name, parser->bufp, n + 2);
	parser->bufp += n + 1;

	if ((val = g_hash_table_lookup(parser->symbol_table, symbol_name))
	    != NULL) {
		html_append_str(parser, val, -1);
		parser->state = HTML_NORMAL;
		return;
	} else if (symbol_name[1] == '#' &&
		   isdigit((guchar)symbol_name[2])) {
		gint ch;

		ch = atoi(symbol_name + 2);
		/* bytes above 127 are only meaningful as Latin-1 */
		if ((ch > 0 && ch <= 127) ||
		    (ch >= 128 && ch <= 255 &&
		     parser->conv->charset == C_ISO_8859_1)) {
			html_append_char(parser, ch);
			parser->state = HTML_NORMAL;
			return;
		}
	}

	/* unknown entity: pass it through unchanged */
	html_append_str(parser, symbol_name, -1);
}
|
|
|
|
static void html_get_parenthesis(HTMLParser *parser, gchar *buf, gint len)
|
|
{
|
|
gchar *p;
|
|
|
|
buf[0] = '\0';
|
|
g_return_if_fail(*parser->bufp == '<');
|
|
|
|
/* ignore comment / CSS / script stuff */
|
|
if (!strncmp(parser->bufp, "<!--", 4)) {
|
|
parser->bufp += 4;
|
|
while ((p = strstr(parser->bufp, "-->")) == NULL)
|
|
if (html_read_line(parser) == HTML_EOF) return;
|
|
parser->bufp = p + 3;
|
|
return;
|
|
}
|
|
if (!g_strncasecmp(parser->bufp, "<style", 6)) {
|
|
parser->bufp += 6;
|
|
while ((p = strcasestr(parser->bufp, "</style>")) == NULL)
|
|
if (html_read_line(parser) == HTML_EOF) return;
|
|
parser->bufp = p + 8;
|
|
return;
|
|
}
|
|
if (!g_strncasecmp(parser->bufp, "<script", 7)) {
|
|
parser->bufp += 7;
|
|
while ((p = strcasestr(parser->bufp, "</script>")) == NULL)
|
|
if (html_read_line(parser) == HTML_EOF) return;
|
|
parser->bufp = p + 9;
|
|
return;
|
|
}
|
|
|
|
parser->bufp++;
|
|
while ((p = strchr(parser->bufp, '>')) == NULL)
|
|
if (html_read_line(parser) == HTML_EOF) return;
|
|
|
|
strncpy2(buf, parser->bufp, MIN(p - parser->bufp + 1, len));
|
|
parser->bufp = p + 1;
|
|
}
|
|
|
|
/* These hash functions were taken from gstring.c in glib.  They are
 * compiled out and kept for reference only. */
#if 0
/* Non-zero when the two strings are equal ignoring case. */
static gint g_str_case_equal(gconstpointer v, gconstpointer v2)
{
	const gchar *lhs = (const gchar *)v;
	const gchar *rhs = (const gchar *)v2;

	return strcasecmp(lhs, rhs) == 0;
}

/* Hash of the lower-cased string: h = h * 31 + c for each character. */
static guint g_str_case_hash(gconstpointer key)
{
	const gchar *cur = key;
	guint hash = *cur;

	/* empty string hashes to 0 */
	if (!hash)
		return hash;

	hash = tolower(hash);
	while (*++cur != '\0')
		hash = (hash << 5) - hash + tolower(*cur);

	return hash;
}
#endif
|