232 lines
6 KiB
Diff
232 lines
6 KiB
Diff
--- webalizer.c.orig 2008-07-02 00:23:43.000000000 +0200
|
|
+++ webalizer.c 2008-07-05 13:45:11.000000000 +0200
|
|
@@ -36,6 +36,7 @@
|
|
#include <sys/utsname.h>
|
|
#include <zlib.h>
|
|
#include <sys/stat.h>
|
|
+#include <iconv.h>
|
|
|
|
/* ensure getopt */
|
|
#ifdef HAVE_GETOPT_H
|
|
@@ -255,6 +256,8 @@
|
|
char pie_color3[] = "#ff00ff"; /* pie additionnal color 3 */
|
|
char pie_color4[] = "#ffc080"; /* pie additionnal color 4 */
|
|
|
|
+iconv_t cd_from_sjis, cd_from_utf8;
|
|
+
|
|
/*********************************************/
|
|
/* MAIN - start here */
|
|
/*********************************************/
|
|
@@ -661,6 +664,9 @@
|
|
/* get processing start time */
|
|
start_time = time(NULL);
|
|
|
|
+ cd_from_sjis = iconv_open("EUC-JP", "Shift_JIS");
|
|
+ cd_from_utf8 = iconv_open("EUC-JP", "UTF-8");
|
|
+
|
|
/*********************************************/
|
|
/* MAIN PROCESS LOOP - read through log file */
|
|
/*********************************************/
|
|
@@ -1477,6 +1483,9 @@
|
|
if (geo_fp) GeoIP_delete(geo_fp);
|
|
#endif
|
|
|
|
+ iconv_close(cd_from_sjis);
|
|
+ iconv_close(cd_from_utf8);
|
|
+
|
|
/* Whew, all done! Exit with completion status (0) */
|
|
exit(0);
|
|
}
|
|
@@ -2079,6 +2088,23 @@
|
|
|
|
if (!str) return NULL; /* make sure strings valid */
|
|
|
|
+ while(*cp1){ /* for apache log's escape code. */
|
|
+ if(*cp1 == '\\' && *(cp1+1) == 'x' &&
|
|
+ isxdigit(*(cp1+2)) && isxdigit(*(cp1+3))){
|
|
+ *cp2 = from_hex(*(cp1+2))*16 + from_hex(*(cp1+3));
|
|
+ if ((*cp2<32)||(*cp2==127)) *cp2='_';
|
|
+ cp1+=4; cp2++;
|
|
+
|
|
+ }
|
|
+ else if(*cp1 == '\\' && *(cp1+1) == '\\'){
|
|
+ *cp2++='\\';
|
|
+ cp1+=2;
|
|
+ }
|
|
+ else *cp2++ = *cp1++;
|
|
+ }
|
|
+ *cp2=*cp1;
|
|
+
|
|
+ cp1=cp2=str;
|
|
while (*cp1)
|
|
{
|
|
if (*cp1=='%') /* Found an escape? */
|
|
@@ -2111,6 +2137,116 @@
|
|
if (*str1==0) return 0; else return 1;
|
|
}
|
|
|
|
/*********************************************/
/* SCORE_EUCJ - rate a string as EUC-JP text */
/*********************************************/

/*
 * Scans the NUL-terminated byte string 'str' and rates how well it
 * parses as EUC-JP.
 *
 * Returns -1 when 'str' is NULL, contains a byte sequence that is not
 * valid EUC-JP, or ends in the middle of a multi-byte character.
 * Otherwise returns a non-negative score: +1 for every printable ASCII
 * byte (0x20-0x7e) and +2 for every completed kanji.  Control bytes
 * (< 0x20) and half-width kana (0x8e + one byte) are accepted but add
 * nothing to the score.
 */
int score_eucj(unsigned char *str)
{
   /* stat: 0 = between characters
    *       1 = expecting the final byte of a kanji
    *       2 = expecting the kana byte that follows 0x8e
    *       3 = expecting the middle byte of a 0x8f three-byte kanji
    */
   int stat  = 0;
   int score = 0;

   if (str == NULL) return -1;

   for (; *str != 0; str++)
   {
      unsigned char c = *str;

      switch (stat)
      {
         case 0:
            if (c >= 0x20 && c <= 0x7e)      score++;    /* ASCII            */
            else if (c >= 0xa1 && c <= 0xfe) stat = 1;   /* kanji, 1st byte  */
            else if (c == 0x8f)              stat = 3;   /* supplementary    */
            else if (c == 0x8e)              stat = 2;   /* half-width kana  */
            else if (c >= 0x20)              return -1;  /* 0x7f, 0x80-0xa0, 0xff */
            /* anything left is a control byte (< 0x20): accepted, unscored */
            break;

         case 1:
            if (c < 0xa1 || c > 0xfe) return -1;
            score += 2;                                  /* kanji completed  */
            stat = 0;
            break;

         case 2:
            if (c < 0xa1 || c > 0xdf) return -1;
            stat = 0;                                    /* kana scores 0    */
            break;

         case 3:
            /* the two bytes after 0x8f obey the same range as a kanji pair */
            if (c < 0xa1 || c > 0xfe) return -1;
            stat = 1;
            break;

         default:
            return -1;
      }
   }

   /* a character left unfinished at the end of the string is invalid */
   return (stat == 0) ? score : -1;
}
|
|
+
|
|
/*********************************************/
/* SCORE_SJIS - rate a string as Shift_JIS   */
/*********************************************/

/*
 * Scans the NUL-terminated byte string 'str' and rates how well it
 * parses as Shift_JIS.
 *
 * Returns -1 when 'str' is NULL, contains a byte sequence that is not
 * valid Shift_JIS, or ends right after a double-byte lead byte.
 * Otherwise returns a non-negative score: +1 for every printable ASCII
 * byte (0x20-0x7e) and +2 for every completed double-byte character.
 * Control bytes (< 0x20) and single-byte kana (0xa1-0xdf) are accepted
 * but add nothing to the score.
 */
int score_sjis(unsigned char *str)
{
   int want_trail = 0;         /* non-zero while a trail byte is pending */
   int score      = 0;

   if (str == NULL) return -1;

   for (; *str != 0; str++)
   {
      unsigned char c = *str;

      if (want_trail)
      {
         /* trail byte: 0x40-0x7e or 0x80-0xfc */
         if (!((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc)))
            return -1;
         score += 2;
         want_trail = 0;
         continue;
      }

      if (c >= 0x20 && c <= 0x7e)
      {
         score++;                                        /* ASCII            */
      }
      else if ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc))
      {
         want_trail = 1;                                 /* lead byte        */
      }
      else if (c >= 0xa1 && c <= 0xdf)
      {
         /* single-byte kana: accepted, unscored */
      }
      else if (c >= 0x20)
      {
         return -1;                  /* 0x7f, 0x80, 0xa0, 0xfd-0xff */
      }
      /* anything left is a control byte (< 0x20): accepted, unscored */
   }

   /* a lead byte with nothing after it is a truncated character */
   return want_trail ? -1 : score;
}
|
|
+
|
|
/*********************************************/
/* SCORE_UTF8 - rate a string as UTF-8 text  */
/*********************************************/

/*
 * Scans the NUL-terminated byte string 'str' and rates how well it
 * parses as UTF-8.
 *
 * Returns -1 when 'str' is NULL, contains a byte that cannot appear at
 * that position in UTF-8, or ends in the middle of a multi-byte
 * sequence.  Otherwise returns a non-negative score: +1 for every
 * printable ASCII byte (0x20-0x7e), +1 for every completed two-byte
 * sequence, +3 for every three-byte sequence and +4 for every four-byte
 * sequence.  Control bytes (< 0x20) are accepted but add nothing.
 *
 * Only lead-byte and continuation-byte ranges are checked; overlong
 * three/four-byte forms and surrogates are not detected here.
 */
int score_utf8(unsigned char *str)
{
   int need  = 0;              /* continuation bytes still expected      */
   int gain  = 0;              /* score to add when the sequence is done */
   int score = 0;

   if (str == NULL) return -1;

   for (; *str != 0; str++)
   {
      unsigned char c = *str;

      if (need > 0)
      {
         if (c < 0x80 || c > 0xbf) return -1;   /* not a continuation byte */
         if (--need == 0) score += gain;
         continue;
      }

      if (c >= 0x20 && c <= 0x7e)
      {
         score++;                                /* ASCII                   */
      }
      else if (c >= 0xc2 && c <= 0xdf)
      {
         /* 0xc0 and 0xc1 could only start overlong forms: never valid */
         need = 1; gain = 1;                     /* two-byte sequence       */
      }
      else if (c >= 0xe0 && c <= 0xef)
      {
         need = 2; gain = 3;                     /* three-byte sequence     */
      }
      else if (c >= 0xf0 && c <= 0xf4)
      {
         /* 0xf5-0xf7 would encode values above U+10FFFF: never valid */
         need = 3; gain = 4;                     /* four-byte sequence      */
      }
      else if (c >= 0x20)
      {
         return -1;        /* 0x7f, stray continuation, invalid lead byte */
      }
      /* anything left is a control byte (< 0x20): accepted, unscored */
   }

   /* a sequence left unfinished at the end of the string is invalid */
   return (need == 0) ? score : -1;
}
|
|
+
|
|
+
|
|
/*********************************************/
|
|
/* SRCH_STRING - get search strings from ref */
|
|
/*********************************************/
|
|
@@ -2122,6 +2258,10 @@
|
|
char srch[80]="";
|
|
unsigned char *cp1, *cp2, *cps;
|
|
int sp_flg=0;
|
|
+ int sjis, eucj, utf8;
|
|
+ char tmpbuf2[BUFSIZE];
|
|
+ size_t inlen, outlen;
|
|
+ unsigned char *cp3;
|
|
|
|
/* Check if search engine referrer or return */
|
|
if ( (cps=(unsigned char *)isinglist(search_list,log_rec.refer))==NULL)
|
|
@@ -2160,9 +2300,39 @@
|
|
cp1=cp2+strlen((char *)cp2)-1;
|
|
while (cp1!=cp2) if (isspace((unsigned char)*cp1)) *cp1--='\0'; else break;
|
|
|
|
+ utf8=score_utf8(cp2);
|
|
+ sjis=score_sjis(cp2);
|
|
+ eucj=score_eucj(cp2);
|
|
+ if(utf8 >= sjis && utf8 >= eucj){
|
|
+ iconv(cd_from_utf8, NULL, 0, NULL, 0);
|
|
+ cp3 = cp2;
|
|
+ inlen = strlen(cp2)+1;
|
|
+ cp1 = tmpbuf2;
|
|
+ outlen = sizeof(tmpbuf2);
|
|
+ if(iconv(cd_from_utf8, (char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 &&
|
|
+ inlen == 0){
|
|
+ cp2 = tmpbuf2;
|
|
+ }
|
|
+ }
|
|
+ else if(sjis > utf8 && sjis > eucj){
|
|
+ iconv(cd_from_sjis, NULL, 0, NULL, 0);
|
|
+ cp3 = cp2;
|
|
+ inlen = strlen(cp2)+1;
|
|
+ cp1 = tmpbuf2;
|
|
+ outlen = sizeof(tmpbuf2);
|
|
+ if(iconv(cd_from_sjis, (char **)&cp3, &inlen, (char**)&cp1, &outlen) >= 0 &&
|
|
+ inlen == 0){
|
|
+ cp2 = tmpbuf2;
|
|
+ }
|
|
+ }
|
|
+
|
|
/* strip invalid chars */
|
|
cp1=cp2;
|
|
- while (*cp1!=0) { if ((*cp1<32)||(*cp1==127)) *cp1='_'; cp1++; }
|
|
+ while (*cp1!=0) {
|
|
+ if ((*cp1<32)||(*cp1==127)) *cp1='_';
|
|
+ *cp1=tolower(*cp1);
|
|
+ cp1++;
|
|
+ }
|
|
|
|
if (put_snode((char *)cp2,(u_int64_t)1,sr_htab))
|
|
{
|