freebsd-ports/www/webalizer/files/webalizer-a-urasim_2.patch
2008-07-17 14:08:56 +00:00

232 lines
6 KiB
Diff

--- webalizer.c.orig 2008-07-02 00:23:43.000000000 +0200
+++ webalizer.c 2008-07-05 13:45:11.000000000 +0200
@@ -36,6 +36,7 @@
#include <sys/utsname.h>
#include <zlib.h>
#include <sys/stat.h>
+#include <iconv.h>
/* ensure getopt */
#ifdef HAVE_GETOPT_H
@@ -255,6 +256,8 @@
char pie_color3[] = "#ff00ff"; /* pie additionnal color 3 */
char pie_color4[] = "#ffc080"; /* pie additionnal color 4 */
+iconv_t cd_from_sjis, cd_from_utf8;
+
/*********************************************/
/* MAIN - start here */
/*********************************************/
@@ -661,6 +664,9 @@
/* get processing start time */
start_time = time(NULL);
+ cd_from_sjis = iconv_open("EUC-JP", "Shift_JIS");
+ cd_from_utf8 = iconv_open("EUC-JP", "UTF-8");
+
/*********************************************/
/* MAIN PROCESS LOOP - read through log file */
/*********************************************/
@@ -1477,6 +1483,9 @@
if (geo_fp) GeoIP_delete(geo_fp);
#endif
+ iconv_close(cd_from_sjis);
+ iconv_close(cd_from_utf8);
+
/* Whew, all done! Exit with completion status (0) */
exit(0);
}
@@ -2079,6 +2088,23 @@
if (!str) return NULL; /* make sure strings valid */
+ while(*cp1){ /* for apache log's escape code. */
+ if(*cp1 == '\\' && *(cp1+1) == 'x' &&
+ isxdigit(*(cp1+2)) && isxdigit(*(cp1+3))){
+ *cp2 = from_hex(*(cp1+2))*16 + from_hex(*(cp1+3));
+ if ((*cp2<32)||(*cp2==127)) *cp2='_';
+ cp1+=4; cp2++;
+
+ }
+ else if(*cp1 == '\\' && *(cp1+1) == '\\'){
+ *cp2++='\\';
+ cp1+=2;
+ }
+ else *cp2++ = *cp1++;
+ }
+ *cp2=*cp1;
+
+ cp1=cp2=str;
while (*cp1)
{
if (*cp1=='%') /* Found an escape? */
@@ -2111,6 +2137,116 @@
if (*str1==0) return 0; else return 1;
}
+/* SCORE_EUCJ - rate str as EUC-JP text: -1 if NULL, malformed or cut   */
+/* off mid-character; else 1 per printable ASCII byte, 2 per kanji.     */
+int score_eucj(unsigned char *str)
+{
+   int stat=0, score=0, bad=0;      /* stat!=0: trail byte(s) still due */
+   if(str==NULL) return -1;
+   for(; *str!=0;str++){
+      switch(stat){
+      case 0:
+         if(*str>= 0x20 && *str <= 0x7e) score++;          /* ASCII */
+         else if(*str >= 0xa1 && *str <= 0xfe) stat=1;     /* kanji lead byte */
+         else if(*str == 0x8f) stat=3;                     /* SS3: supplementary kanji */
+         else if(*str == 0x8e) stat=2;                     /* SS2: half-width kana */
+         else if(*str < 0x20);                             /* control byte, ignored */
+         else bad=1;
+         break;
+      case 1:
+         if(*str >= 0xa1 && *str <= 0xfe) score += 2;      /* kanji trail byte */
+         else bad=1;
+         stat=0;
+         break;
+      case 2:
+         if(*str < 0xa1 || *str > 0xdf) bad=1;             /* kana scores 0 */
+         stat=0;
+         break;
+      case 3:                           /* first of the two bytes after SS3 */
+         if(*str >= 0xa1 && *str <= 0xfe) stat=1; else {bad=1; stat=0;}
+         break;
+      }
+   }
+   return (bad != 0 || stat != 0) ? -1 : score;  /* stat!=0: truncated */
+}
+
+/* SCORE_SJIS - rate how plausibly str is Shift_JIS text.               */
+/* Returns -1 if str is NULL, holds an impossible byte, or ends in the  */
+/* middle of a double-byte character.  Otherwise each printable ASCII   */
+/* byte scores 1, each double-byte character 2, half-width kana 0.      */
+int score_sjis(unsigned char *str)
+{
+   int stat=0, score=0, bad=0;         /* stat==1: trail byte expected */
+   if(str==NULL) return -1;
+   for(; *str != 0; str++){
+      switch(stat){
+      case 0:
+         if(*str>= 0x20 && *str <= 0x7e) score++;           /* ASCII */
+         else if((*str >= 0x81 && *str <= 0x9f) ||
+                 (*str >= 0xe0 && *str <= 0xfc)) stat=1;    /* lead byte */
+         else if(*str >= 0xa1 && *str <= 0xdf);             /* half-width kana */
+         else if(*str < 0x20);                              /* control byte, ignored */
+         else bad=1;
+         break;
+      case 1:
+         if((*str >= 0x40 && *str <= 0x7e) ||
+            (*str >= 0x80 && *str <= 0xfc)) score += 2;     /* trail byte */
+         else bad=1;
+         stat=0;
+         break;
+      }
+   }
+   return (bad != 0 || stat != 0) ? -1 : score;  /* stat!=0: truncated */
+}
+
+/* SCORE_UTF8 - rate how plausibly str is UTF-8 text.                   */
+/* Returns -1 if str is NULL, holds a byte that cannot start or         */
+/* continue a sequence, or ends mid-sequence.  Otherwise scores 1 per   */
+/* printable ASCII byte or 2-byte sequence, 3 per 3-byte, 4 per 4-byte. */
+int score_utf8(unsigned char *str)
+{
+   int stat=0, score=0, bad=0;         /* stat!=0: inside a sequence */
+   if(str==NULL) return -1;
+   for(; *str != 0; str++){
+      switch(stat){
+      case 0:
+         if(*str>= 0x20 && *str <= 0x7e) score++;          /* ASCII */
+         else if(*str >= 0xc2 && *str <= 0xdf) stat=1;     /* 2-byte lead (c0/c1 are overlong) */
+         else if(*str >= 0xe0 && *str <= 0xef) stat=2;     /* 3-byte lead */
+         else if(*str >= 0xf0 && *str <= 0xf4) stat=4;     /* 4-byte lead (f5+ is past U+10FFFF) */
+         else if(*str < 0x20);                             /* control byte, ignored */
+         else bad=1;
+         break;
+      case 1:                           /* last byte of a 2-byte sequence */
+         if(*str >= 0x80 && *str <= 0xbf) score++;
+         else bad=1;
+         stat=0;
+         break;
+      case 2:                           /* middle byte of a 3-byte sequence */
+         if(*str >= 0x80 && *str <= 0xbf) stat=3;
+         else {bad=1; stat=0;}
+         break;
+      case 3:                           /* last byte of a 3-byte sequence */
+         if(*str >= 0x80 && *str <= 0xbf) score+=3;
+         else bad=1;
+         stat=0;
+         break;
+      case 4:                           /* 2nd and 3rd bytes of a 4-byte sequence */
+      case 5:
+         if(*str >= 0x80 && *str <= 0xbf) stat++;
+         else {bad=1; stat=0;}
+         break;
+      case 6:                           /* last byte of a 4-byte sequence */
+         if(*str >= 0x80 && *str <= 0xbf) score+=4;
+         else bad=1;
+         stat=0;
+         break;
+      }
+   }
+   return (bad != 0 || stat != 0) ? -1 : score;  /* stat!=0: truncated */
+}
+
+
/*********************************************/
/* SRCH_STRING - get search strings from ref */
/*********************************************/
@@ -2122,6 +2258,10 @@
char srch[80]="";
unsigned char *cp1, *cp2, *cps;
int sp_flg=0;
+ int sjis, eucj, utf8;
+ char tmpbuf2[BUFSIZE];
+ size_t inlen, outlen;
+ unsigned char *cp3;
/* Check if search engine referrer or return */
if ( (cps=(unsigned char *)isinglist(search_list,log_rec.refer))==NULL)
@@ -2160,9 +2300,39 @@
cp1=cp2+strlen((char *)cp2)-1;
while (cp1!=cp2) if (isspace((unsigned char)*cp1)) *cp1--='\0'; else break;
+      /* Guess the search string's encoding and normalise it to EUC-JP. */
+      /* UTF-8 wins ties; Shift_JIS must strictly beat both rivals; in  */
+      /* every other case the string is left exactly as it is.          */
+      utf8=score_utf8(cp2);
+      sjis=score_sjis(cp2);
+      eucj=score_eucj(cp2);
+      if(utf8 >= sjis && utf8 >= eucj){
+         iconv(cd_from_utf8, NULL, 0, NULL, 0);    /* reset conversion state */
+         cp3 = cp2;
+         inlen = strlen((char *)cp2)+1;            /* +1: convert the NUL too */
+         cp1 = (unsigned char *)tmpbuf2;
+         outlen = sizeof(tmpbuf2);
+         /* iconv() returns size_t: failure is (size_t)-1, never < 0 */
+         if(iconv(cd_from_utf8, (char **)&cp3, &inlen, (char**)&cp1, &outlen) != (size_t)-1 &&
+            inlen == 0) cp2 = (unsigned char *)tmpbuf2;
+      }
+      else if(sjis > utf8 && sjis > eucj){
+         iconv(cd_from_sjis, NULL, 0, NULL, 0);    /* reset conversion state */
+         cp3 = cp2;
+         inlen = strlen((char *)cp2)+1;
+         cp1 = (unsigned char *)tmpbuf2;
+         outlen = sizeof(tmpbuf2);
+         if(iconv(cd_from_sjis, (char **)&cp3, &inlen, (char**)&cp1, &outlen) != (size_t)-1 &&
+            inlen == 0) cp2 = (unsigned char *)tmpbuf2;
+      }
+
/* strip invalid chars */
cp1=cp2;
- while (*cp1!=0) { if ((*cp1<32)||(*cp1==127)) *cp1='_'; cp1++; }
+ while (*cp1!=0) {
+ if ((*cp1<32)||(*cp1==127)) *cp1='_';
+ *cp1=tolower(*cp1);
+ cp1++;
+ }
if (put_snode((char *)cp2,(u_int64_t)1,sr_htab))
{