#include "mendex.h" #include #include #include "exkana.h" #include "exhanzi.h" #include "exvar.h" int line_length=0; static void printpage(struct index *ind, FILE *fp, int num, char *lbuff); static int range_check(struct index ind, int count, char *lbuff, FILE *fp); static void linecheck(char *lbuff, char *tmpbuff, FILE *fp, int force); static void crcheck(char *lbuff, FILE *fp); static void index_normalize(UChar *istr, UChar *ini, int *chset); static int initial_cmp_char(UChar *ini, UChar *ch); static int init_hanzi_header(void); static const UNormalizer2 *unormalizer_NFD, *unormalizer_NFKD; static int turkish_i; #define M_NONE 0 #define M_TO_UPPER 1 #define M_TO_TITLE 2 #define M_TO_LOWER -1 #define CHOSEONG_KIYEOK 0x1100 /* All buffers have size BUFFERLEN. */ #define BUFFERLEN 4096 #ifdef HAVE___VA_ARGS__ /* Use C99 variadic macros if they are supported. */ /* We would like to use sizeof(buf) instead of BUFFERLEN but that fails for, e.g., gcc-4.8.3 on Cygwin and gcc-4.5.3 on NetBSD. */ #define SPRINTF(buf, ...) \ snprintf(buf, BUFFERLEN, __VA_ARGS__) #define SAPPENDF(buf, ...) \ snprintf(buf + strlen(buf), BUFFERLEN - strlen(buf), __VA_ARGS__) #else /* Alternatively use static inline functions. */ static inline int SPRINTF(char *buf, const char *format, ...) { va_list argptr; int n; va_start(argptr, format); n = vsnprintf(buf, BUFFERLEN, format, argptr); va_end(argptr); return n; } static inline int SAPPENDF(char *buf, const char *format, ...) { va_list argptr; int n; va_start(argptr, format); n = vsnprintf(buf + strlen(buf), BUFFERLEN - strlen(buf), format, argptr); va_end(argptr); return n; } #endif static void fprint_uchar(FILE *fp, const UChar *a, const int mode, const int len) { char str[3*INITIALLENGTH+1]; UChar istr[INITIALLENGTH]; int olen, wclen; UErrorCode perr; if (len<0) { u_strcpy(istr,a); wclen=u_strlen(istr); } else { wclen=is_surrogate_pair(a) ? 
2 : 1; istr[0]=a[0]; if (wclen==2) istr[1]=a[1]; istr[wclen]=L'\0'; } if (mode==M_TO_UPPER) { perr=U_ZERO_ERROR; olen=u_strToUpper(istr,INITIALLENGTH,istr,wclen,"",&perr); } else if (mode==M_TO_LOWER) { perr=U_ZERO_ERROR; olen=u_strToLower(istr,INITIALLENGTH,istr,wclen, istr[0]==0x130&&turkish_i==2?"tr":"", &perr); } else if (mode==M_TO_TITLE) { perr=U_ZERO_ERROR; olen=u_strToTitle(istr,INITIALLENGTH,istr,wclen,NULL,"",&perr); } else olen=wclen; if (olen>INITIALLENGTH) { warn_printf(efp, "\nWarning: Too long (%d) header.\n", olen); wclen=INITIALLENGTH; } else wclen=olen; perr=U_ZERO_ERROR; u_strToUTF8(str, 3*INITIALLENGTH+1, &olen, istr, wclen, &perr); fprintf(fp,"%s",str); } #ifdef WIN32 /* fprintf with convert kanji code */ int fprintf2(FILE *fp, const char *format, ...) { char print_buff[8000]; va_list argptr; int n; va_start(argptr, format); n = vsnprintf(print_buff, sizeof print_buff, format, argptr); va_end(argptr); fputs(print_buff, fp); return n; } #endif void warn_printf(FILE *fp, const char *format, ...) { char print_buff[8000]; va_list argptr; va_start(argptr, format); vsnprintf(print_buff, sizeof print_buff, format, argptr); va_end(argptr); warn++; fputs(print_buff, stderr); if (fp!=stderr) fputs(print_buff, fp); } void verb_printf(FILE *fp, const char *format, ...) 
{ char print_buff[8000]; va_list argptr; va_start(argptr, format); vsnprintf(print_buff, sizeof print_buff, format, argptr); va_end(argptr); if (verb!=0) fputs(print_buff, stderr); if (fp!=stderr) fputs(print_buff, fp); } static int pnumconv2(struct page *p) { int j,k,cc,pclen; pclen=strlen(page_compositor); for (j=k=cc=0;jpage);j++) { if (strncmp(p->page+j,page_compositor,pclen)==0) { j+=pclen; k=j; cc++; continue; } } return pnumconv(p->page+k,p->attr[cc]); } /* write ind file */ void indwrite(char *filename, struct index *ind, int pagenum) { int i,j,k,q,hpoint=0,tpoint=0,bpoint[NUM_BRAHMIC]={0},block_open=0,*__point; char lbuff[BUFFERLEN],obuff[BUFFERLEN]; UChar initial[INITIALLENGTH],initial_prev[INITIALLENGTH],*__head; int chset,chset_prev; FILE *fp=NULL; UErrorCode perr; if (filename && kpse_out_name_ok(filename)) fp=fopen(filename,"wb"); if (fp == NULL) { fp=stdout; #ifdef WIN32 setmode(fileno(fp), _O_BINARY); #endif } fputs(preamble,fp); if (fpage>0) { fprintf(fp,"%s%d%s",setpage_prefix,pagenum,setpage_suffix); } perr=U_ZERO_ERROR; unormalizer_NFD =unorm2_getInstance(NULL, "nfc", UNORM2_DECOMPOSE, &perr); unormalizer_NFKD=unorm2_getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, &perr); if (strlen(symhead)==0) { if (lethead_flag>0) { strcpy(symhead, symhead_positive); } else if (lethead_flag<0) { strcpy(symhead, symhead_negative); } } { if (lethead_flag>0) { strcpy(numhead, numhead_positive); } else if (lethead_flag<0) { strcpy(numhead, numhead_negative); } } for (i=line_length=0;i=CH_DEVANAGARI && chset<=CH_LAO) { __head=brahmic_head[chset-CH_DEVANAGARI]; __point=&bpoint[chset-CH_DEVANAGARI]; if (lethead_flag!=0) { fputs(lethead_prefix,fp); for (j=*__point;j<(u_strlen(__head));) { if (initial_cmp_char(initial,&__head[j])) { k=j; U16_BACK_1(__head, 0, k); fprint_uchar(fp,&__head[k],M_NONE,1); *__point=j; break; } U16_FWD_1(__head, j, -1); } if (j==(u_strlen(__head))) { k=j; U16_BACK_1(__head, 0, k); fprint_uchar(fp,&__head[k],M_NONE,1); } fputs(lethead_suffix,fp); 
} for (*__point=0;*__point<(u_strlen(__head));) { if (initial_cmp_char(initial,&__head[*__point])) { break; } U16_FWD_1(__head, *__point, -1); } } else { if (lethead_flag!=0 && symbol_flag==2 && chset==CH_NUMERIC) { fprintf(fp,"%s%s%s",lethead_prefix,numhead,lethead_suffix); } if (lethead_flag!=0 && (symbol_flag==1 || (symbol_flag==2 && chset!=CH_NUMERIC))) { fprintf(fp,"%s%s%s",lethead_prefix,symhead,lethead_suffix); } } SPRINTF(lbuff,"%s",item_0[0]); k=0; } else { index_normalize(ind[i-1].dic[0], initial_prev, &chset_prev); if (chset!=chset_prev && is_any_script(chset_prev) && block_open) { if (strlen(script_postamble[chset_prev])) { fputs(script_postamble[chset_prev],fp); } block_open=0; } if (chset!=chset_prev && is_any_script(chset)) { if (strlen(script_preamble[chset])) { fputs(script_preamble[chset],fp); block_open=chset; } } if ((CH_LATIN<=chset&&chset<=CH_GREEK) || chset==CH_HANZI || (CH_ARABIC<=chset&&chset<=CH_HEBREW)) { if (chset!=chset_prev || ss_comp(initial,initial_prev)) { fputs(group_skip,fp); if (lethead_flag!=0) { fputs(lethead_prefix,fp); fprint_uchar(fp,initial,lethead_flag,-1); fputs(lethead_suffix,fp); } } } else if (chset==CH_KANA) { for (j=hpoint;j<(u_strlen(kana_head));) { if (initial_cmp_char(initial,&kana_head[j])) { break; } U16_FWD_1(kana_head, j, -1); } if ((j!=hpoint)||(j==0)) { hpoint=j; fputs(group_skip,fp); if (lethead_flag!=0) { k=j; U16_BACK_1(kana_head, 0, k); fputs(lethead_prefix,fp); fprint_uchar(fp,&kana_head[k],M_NONE,1); fputs(lethead_suffix,fp); } } } else if (chset==CH_HANGUL) { for (j=tpoint;j<(u_strlen(hangul_head));j++) { if (initial_cmp_char(initial,&hangul_head[j])) { break; } } if ((j!=tpoint)||(j==0)) { tpoint=j; fputs(group_skip,fp); if (lethead_flag!=0) { fputs(lethead_prefix,fp); fprint_uchar(fp,&hangul_head[j-1],M_NONE,1); fputs(lethead_suffix,fp); } } } else if (chset>=CH_DEVANAGARI && chset<=CH_LAO) { __head=brahmic_head[chset-CH_DEVANAGARI]; __point=&bpoint[chset-CH_DEVANAGARI]; for 
(j=*__point;j<(u_strlen(__head));) { if (initial_cmp_char(initial,&__head[j])) { break; } U16_FWD_1(__head, j, -1); } if ((j!=*__point)||(j==0)) { *__point=j; fputs(group_skip,fp); if (lethead_flag!=0) { k=j; U16_BACK_1(__head, 0, k); fputs(lethead_prefix,fp); fprint_uchar(fp,&__head[k],M_NONE,1); fputs(lethead_suffix,fp); } } } else { if (chset_prev!=chset) { if (is_any_script(chset_prev) || symbol_flag==2) fputs(group_skip,fp); if (lethead_flag!=0 && symbol_flag==2 && chset==CH_NUMERIC) { fprintf(fp,"%s%s%s",lethead_prefix,numhead,lethead_suffix); } if (lethead_flag!=0 && (symbol_flag==1 && is_any_script(chset_prev) || symbol_flag==2 && chset!=CH_NUMERIC) ) { fprintf(fp,"%s%s%s",lethead_prefix,symhead,lethead_suffix); } } } q=0; for (j=0;jj) { int epage = pnumconv2(&ind[num].p[cc]); int bpage = pnumconv2(&ind[num].p[j]); if (epage==bpage) { j=cc-1; continue; } /* range process */ if (ind[num].p[j].enc[0]==range_open || ind[num].p[j].enc[0]==range_close) ind[num].p[j].enc++; if (strlen(ind[num].p[j].enc)>0) { SPRINTF(buff,"%s%s%s",encap_prefix,ind[num].p[j].enc,encap_infix); } /* print beginning of range */ SAPPENDF(buff,"%s",ind[num].p[j].page); if (strlen(suffix_3p)>0 && epage-bpage==2) { SAPPENDF(buff,"%s",suffix_3p); } else if (strlen(suffix_mp)>0 && epage-bpage>=2) { SAPPENDF(buff,"%s",suffix_mp); } else if (strlen(suffix_2p)>0 && epage-bpage==1) { SAPPENDF(buff,"%s",suffix_2p); } else { SAPPENDF(buff,"%s",delim_r); SAPPENDF(buff,"%s",ind[num].p[cc].page); } SAPPENDF(tmpbuff,"%s",buff); buff[0]='\0'; if (strlen(ind[num].p[j].enc)>0) { SAPPENDF(tmpbuff,"%s",encap_suffix); } linecheck(lbuff,tmpbuff,fp, FALSE); j=cc; if (j==ind[num].num) { goto PRINT; } else { SAPPENDF(tmpbuff,"%s",delim_n); linecheck(lbuff,tmpbuff,fp, TRUE); } } else if (strlen(ind[num].p[j].enc)>0) { /* normal encap */ if (ind[num].p[j].enc[0]==range_close) { SPRINTF(errbuff,"Warning: Unmatched range closing operator \'%c\',",range_close); for (i=0;i0) { SAPPENDF(tmpbuff,"%s",encap_prefix); 
SAPPENDF(tmpbuff,"%s",ind[num].p[j].enc); SAPPENDF(tmpbuff,"%s",encap_infix); SAPPENDF(tmpbuff,"%s",ind[num].p[j].page); SAPPENDF(tmpbuff,"%s",encap_suffix); SAPPENDF(tmpbuff,"%s",delim_n); linecheck(lbuff,tmpbuff,fp, FALSE); } else { SAPPENDF(tmpbuff,"%s",ind[num].p[j].page); SAPPENDF(tmpbuff,"%s",delim_n); linecheck(lbuff,tmpbuff,fp, FALSE); } } else { /* no encap */ SAPPENDF(tmpbuff,"%s",ind[num].p[j].page); SAPPENDF(tmpbuff,"%s",delim_n); linecheck(lbuff,tmpbuff,fp, FALSE); } } if (ind[num].p[j].enc[0]==range_open) { SPRINTF(errbuff,"Warning: Unmatched range opening operator \'%c\',",range_open); for (k=0;k0) { SAPPENDF(tmpbuff,"%s",encap_prefix); SAPPENDF(tmpbuff,"%s",ind[num].p[j].enc); SAPPENDF(tmpbuff,"%s",encap_infix); SAPPENDF(tmpbuff,"%s",ind[num].p[j].page); SAPPENDF(tmpbuff,"%s",encap_suffix); } else { SAPPENDF(tmpbuff,"%s",ind[num].p[j].page); } linecheck(lbuff,tmpbuff,fp, FALSE); PRINT: fputs(lbuff,fp); fputs(delim_t,fp); lbuff[0]='\0'; } static int range_check(struct index ind, int count, char *lbuff, FILE *fp) { int i,j,k,cc1,cc2,start,force=0; char tmpbuff[BUFFERLEN],errbuff[BUFFERLEN],obuff[BUFFERLEN]; for (i=count;i0) { SPRINTF(tmpbuff,"%s%s%s%s%s%s",encap_prefix,ind.p[j].enc,encap_infix ,ind.p[j].page,encap_suffix,delim_n); linecheck(lbuff,tmpbuff,fp, FALSE); } } } if (j==ind.num+1) { SPRINTF(errbuff,"Warning: Unmatched range opening operator \'%c\',",range_open); for (k=0;k=cc2+2 || (cc1>=cc2+1 && strlen(suffix_2p)) || force) { return i-1; } else return count; } /* check line length */ static void linecheck(char *lbuff, char *tmpbuff, FILE *fp, int force) { if (line_length+strlen(tmpbuff)>line_max && !force) { SAPPENDF(lbuff,"\n"); fputs(lbuff,fp); lbuff[0]='\0'; SAPPENDF(lbuff,"%s",indent_space); SAPPENDF(lbuff,"%s",tmpbuff); line_length=indent_length+strlen(tmpbuff); tmpbuff[0]='\0'; } else { SAPPENDF(lbuff,"%s",tmpbuff); line_length+=strlen(tmpbuff); tmpbuff[0]='\0'; } } static void crcheck(char *lbuff, FILE *fp) { int i; char 
buff[BUFFERLEN]; for (i=strlen(lbuff);i>=0;i--) { if (lbuff[i]=='\n') { strncpy(buff,lbuff,i+1); buff[i+1]='\0'; fputs(buff,fp); strncpy(buff,&lbuff[i+1],BUFFERLEN-1); strcpy(lbuff,buff); break; } } } static void index_normalize(UChar *istr, UChar *ini, int *chset) { int k, hi, lo, mi; UChar ch,src[2],dest[8],strX[4],strY[4],strZ[4],strW[4]; UChar32 c32,cu; UErrorCode perr; UCollationResult order,order1,order2,order3,order4,order5,order6; UCollationStrength strgth; static int i_y_mode=0,o_o_mode=0,u_u_mode=0,v_w_mode=0,s_s_mode=0,t_t_mode=0,d_d_mode=0,e_e_mode=0; ch=istr[0]; *chset=charset(istr); ini[1]=L'\0'; if (is_circkana(ch) || is_hankana(ch) || is_sqkana(ch) || is_circlatin(ch)) { /* ㋐㋑㋒.. アイウ.. ㌀㌁㌂.. */ src[0]=ch; src[1]=0x00; perr=U_ZERO_ERROR; unorm2_normalize(unormalizer_NFKD, src, 1, dest, 8, &perr); if (U_SUCCESS(perr)) ch=dest[0]; } if (is_hiragana(ch)) { ch+=KATATOP-HIRATOP; /* hiragana -> katakana */ } if (is_katakana(ch)) { ini[0]=kanatable[ch-KATATOP]; return; } if (is_extkana(ch)) { /* ㇰㇱㇲㇳㇴ .. 
ㇻㇼㇽㇾㇿ */ ini[0]=extkanatable[ch-EXKANATOP]; return; } if (ch==0x309F) { ini[0]=0x3088; return; } /* HIRAGANA YORI -> よ */ else if (ch==0x30FF) { ini[0]=0x3053; return; } /* KATAKANA KOTO -> こ */ else if (is_jpn_kana(istr)==2) { c32=U16_GET_SUPPLEMENTARY(istr[0],istr[1]); switch (c32) { case 0x1F200: /* 🈀 */ ini[0]=0x307B; break; /* ほ */ case 0x1B000: /* 𛀀 */ ini[0]=0x3048; break; /* え */ case 0x1B11F: case 0x1B122: /* Archaic WU 𛄟 𛄢 */ ini[0]=0xD82C; ini[1]=0xDD1F; ini[2]=L'\0'; break; case 0x1B120: /* Archaic YI 𛄠 */ ini[0]=0xD82C; ini[1]=0xDD20; ini[2]=L'\0'; break; case 0x1B121: case 0x1B001: /* Archaic YE 𛄡 𛀁 */ ini[0]=0xD82C; ini[1]=0xDD21; ini[2]=L'\0'; break; case 0x1B132: case 0x1B155: ini[0]=0x3053; break; /* こ */ case 0x1B150: case 0x1B164: ini[0]=0x3090; break; /* ゐ */ case 0x1B151: case 0x1B165: ini[0]=0x3091; break; /* ゑ */ case 0x1B152: case 0x1B166: ini[0]=0x3092; break; /* を */ case 0x1B167: default: ini[0]=0x3093; break; /* ん */ } return; } else if (is_kor_hngl(&ch)) { if ((ch>=0xAC00)&&(ch<=0xD7AF)) { /* Hangul Syllables */ ch=(ch-0xAC00)/(21*28)+CHOSEONG_KIYEOK; /* convert to Hangul Jamo, Initial consonants */ } else switch (ch) { case 0x3131: case 0xFFA1: case 0x3200: case 0x320E: case 0x3260: case 0x326E: ch=0x1100; break; /* ᄀ */ case 0x3132: case 0xFFA2: ch=0x1101; break; /* ᄁ */ case 0x3134: case 0xFFA4: case 0x3201: case 0x320F: case 0x3261: case 0x326F: ch=0x1102; break; /* ᄂ */ case 0x3137: case 0xFFA7: case 0x3202: case 0x3210: case 0x3262: case 0x3270: ch=0x1103; break; /* ᄃ */ case 0x3138: case 0xFFA8: ch=0x1104; break; /* ᄄ */ case 0x3139: case 0xFFA9: case 0x3203: case 0x3211: case 0x3263: case 0x3271: ch=0x1105; break; /* ᄅ */ case 0x3141: case 0xFFB1: case 0x3204: case 0x3212: case 0x3264: case 0x3272: ch=0x1106; break; /* ᄆ */ case 0x3142: case 0xFFB2: case 0x3205: case 0x3213: case 0x3265: case 0x3273: ch=0x1107; break; /* ᄇ */ case 0x3143: case 0xFFB3: ch=0x1108; break; /* ᄈ */ case 0x3145: case 0xFFB5: case 0x3206: case 0x3214: 
case 0x3266: case 0x3274: ch=0x1109; break; /* ᄉ */ case 0x3146: case 0xFFB6: ch=0x110A; break; /* ᄊ */ case 0x3147: case 0xFFB7: case 0x3207: case 0x3215: case 0x3267: case 0x3275: case 0x321D: case 0x321E: case 0x327E: /* ㈝ ㈞ ㉾ */ ch=0x110B; break; /* ᄋ */ case 0x3148: case 0xFFB8: case 0x3208: case 0x3216: case 0x3268: case 0x3276: case 0x321C: case 0x327D: /* ㈜ ㉽ */ ch=0x110C; break; /* ᄌ */ case 0x3149: case 0xFFB9: ch=0x110D; break; /* ᄍ */ case 0x314A: case 0xFFBA: case 0x3209: case 0x3217: case 0x3269: case 0x3277: case 0x327C: /* ㉼ */ ch=0x110E; break; /* ᄎ */ case 0x314B: case 0xFFBB: case 0x320A: case 0x3218: case 0x326A: case 0x3278: ch=0x110F; break; /* ᄏ */ case 0x314C: case 0xFFBC: case 0x320B: case 0x3219: case 0x326B: case 0x3279: ch=0x1110; break; /* ᄐ */ case 0x314D: case 0xFFBD: case 0x320C: case 0x321A: case 0x326C: case 0x327A: ch=0x1111; break; /* ᄑ */ case 0x314E: case 0xFFBE: case 0x320D: case 0x321B: case 0x326D: case 0x327B: ch=0x1112; break; /* ᄒ */ } ini[0]=ch; return; } else if (is_hanzi(istr)) { if (hanzi_mode==0) hanzi_mode=init_hanzi_header(); if (hanzi_mode==HANZI_UNKNOWN) { u_strcpy(ini, hz_index[0].idx); return; } lo=0; hi=hz_index_len; while (lo=0x0E40 && istr[0]<=0x0E44) && (istr[1]>=0x0E01 && istr[1]<=0x0E2E)) || ((istr[0]>=0x0EC0 && istr[0]<=0x0EC4) && (istr[1]>=0x0E81 && istr[1]<=0x0EAE))) { /* Thai/Lao reordering :: Vowel followed by Consonant */ /* https://unicode-org.github.io/icu/userguide/collation/concepts.html#thailao-reordering */ ini[0]=istr[1]; } else if (istr[0]==0x0EDC || istr[0]==0x0EDD) { /* ໜ ໝ */ ini[0]=0xEAB; /* ຫ */ } else { ini[0]=istr[0]; } return; } else if (is_surrogate_pair(istr)&&is_brahmic(istr)) { ini[0]=istr[0]; ini[1]=istr[1]; ini[2]=L'\0'; return; } else if (is_brahmic(&ch)||is_arabic(&ch)||is_hebrew(&ch)) { if (ch==0x626) { /* Arabic Letter Yeh with Hamza Above for Uyghur */ strY[0]=0x626; strY[1]=L'\0'; /* Yeh with Hamza Above */ strZ[0]=0x628; strZ[1]=L'\0'; /* Beh */ order = 
ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { ini[0]=strY[0]; ini[1]=strY[1]; return; } } /* Devanagary, Marathi */ if ( (istr[0]==0x915 && istr[1]==0x94D && istr[2]== 0x937) /* KSSA क्ष */ || (istr[0]==0x91C && istr[1]==0x94D && istr[2]== 0x91E)) { /* JNYA ज्ञ */ strY[0]=istr[0]; strY[1]=istr[1]; strY[2]=istr[2]; strY[3]=L'\0'; strZ[0]=0x939; strZ[1]=L'\0'; /* HA ह */ order = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { u_strcpy(ini,strY); return; } } /* Bengali */ if ( (istr[0]==0x995 && istr[1]==0x9CD && istr[2]== 0x9B7)) { /* KSSA ক্ষ */ strY[0]=istr[0]; strY[1]=istr[1]; strY[2]=istr[2]; strY[3]=L'\0'; strZ[0]=0x9B9; strZ[1]=L'\0'; /* HA হ */ order = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { u_strcpy(ini,strY); return; } } /* Oriya */ if ( (istr[0]==0xB15 && istr[1]==0xB4D && istr[2]== 0xB37)) { /* KSSA କ୍ଷ */ strY[0]=istr[0]; strY[1]=istr[1]; strY[2]=istr[2]; strY[3]=L'\0'; strZ[0]=0xB39; strZ[1]=L'\0'; /* HA ହ */ order = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { u_strcpy(ini,strY); return; } } if (ch==0x929||ch==0x931||ch==0x934||(0x958<=ch&&ch<=0x95F) /* Devanagary */ ||(0x622<=ch&&ch<=0x626)||ch==0x6C0||ch==0x6C2||ch==0x6D3 /* Arabic */ ||(0xFB50<=ch&&ch<=0xFDFF) /* Arabic Presentation Forms-A */ ||(0xFE70<=ch&&ch<=0xFEFF) /* Arabic Presentation Forms-B */ ||(0xFB1D<=ch&&ch<=0xFB4F) /* Hebrew presentation forms */ ||ch==0x9DC||ch==0x9DD||ch==0x9DF /* Bengali */ ||ch==0xA33||ch==0xA36||(0xA59<=ch&&ch<=0xA5B)||ch==0xA5E /* Gurmukhi */ ||ch==0xB5C||ch==0xB5D /* Oriya */ ||ch==0xB94 /* Tamil */ ) { src[0]=ch; src[1]=0x00; perr=U_ZERO_ERROR; unorm2_normalize(unormalizer_NFD, src, 1, dest, 8, &perr); if (U_SUCCESS(perr)) ch=dest[0]; /* without modifier */ } else if (ch==0x5DA||ch==0x5DD||ch==0x05DF||ch==0x5E3||ch==0x05E5) { /* Hebrew letter final */ ch++; } ini[0]=ch; return; } if (ch>=0xFF21&&ch<=0xFF3A || ch>=0xFF41&&ch<=0xFF5A) { /* Fullwidth latin 
letter */ ch-=0xFF21-0x0041; } cu = u_toupper(ch); if (cu==0x049||ch==0x130||ch==0x131||cu==0x0CE) { /* check dotted/dotless İ,I,i,ı and Î,î for Turkish */ if (turkish_i==0) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_SECONDARY); strX[0] = 0x131; strX[1] = 0x069; strX[2] = 0x00; /* ıi */ strZ[0] = 0x049; strZ[1] = 0x130; strZ[2] = 0x00; /* Iİ */ order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); turkish_i = (order==UCOL_EQUAL) ? 2 : 1; ucol_setStrength(icu_collator, strgth); } if (turkish_i==2) { ini[0] = (ch==0x049||ch==0x131) ? 0x131 : 0x130; /* ı or İ */ return; } } if (cu==0x059) { /* check Y versus I for Lithuanian */ if (i_y_mode==0) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); strX[0] = 0x059; strX[1] = 0x00; /* Y */ strZ[0] = 0x049; strZ[1] = 0x00; /* I */ order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); i_y_mode = (order==UCOL_EQUAL) ? 2 : 1; ucol_setStrength(icu_collator, strgth); } if (i_y_mode==2) { ini[0] = 0x049; /* I */ return; } } if (cu==0x057) { /* check V versus W for Finnish */ if (v_w_mode==0) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); strX[0] = 0x057; strX[1] = 0x00; /* W */ strZ[0] = 0x056; strZ[1] = 0x00; /* V */ order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); v_w_mode = (order==UCOL_EQUAL) ? 
2 : 1; ucol_setStrength(icu_collator, strgth); } if (v_w_mode==2) { ini[0] = 0x056; /* V */ return; } } if (cu==0x15E||cu==0x218) { /* check Ş versus Ș for Romanian */ if (s_s_mode<=2) { strgth = ucol_getStrength(icu_collator); strX[0] = ch; strX[1] = 0x00; /* myself */ strZ[0] = 0x053; strZ[1] = 0x00; /* S */ ucol_setStrength(icu_collator, UCOL_PRIMARY); order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); if (s_s_mode==0) { strX[0] = 0x15E; strX[1] = 0x00; /* Ş */ strY[0] = 0x218; strY[1] = 0x00; /* Ș */ order1 = ucol_strcoll(icu_collator, strY, -1, strX, -1); if (order1==UCOL_EQUAL) s_s_mode = 3; } if (s_s_mode<3) { s_s_mode = (order==UCOL_EQUAL) ? 1 : 2; } ucol_setStrength(icu_collator, strgth); } if (s_s_mode==3) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_QUATERNARY); order = ucol_strcoll(icu_collator, strY, -1, strX, -1); s_s_mode = (order==UCOL_LESS) ? 5 : 4; ucol_setStrength(icu_collator, strgth); } if (s_s_mode==2) { ini[0] = cu; return; } if (s_s_mode==4) { ini[0] = 0x15E; return; } if (s_s_mode==5) { ini[0] = 0x218; return; } } if (cu==0x162||cu==0x21A) { /* check Ţ versus Ț for Romanian */ if (t_t_mode<=2) { strgth = ucol_getStrength(icu_collator); strX[0] = ch; strX[1] = 0x00; /* myself */ strZ[0] = 0x054; strZ[1] = 0x00; /* T */ ucol_setStrength(icu_collator, UCOL_PRIMARY); order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); if (t_t_mode==0) { strX[0] = 0x162; strX[1] = 0x00; /* Ţ */ strY[0] = 0x21A; strY[1] = 0x00; /* Ț */ order1 = ucol_strcoll(icu_collator, strY, -1, strX, -1); if (order1==UCOL_EQUAL) t_t_mode = 3; } if (t_t_mode<3) { t_t_mode = (order==UCOL_EQUAL) ? 1 : 2; } ucol_setStrength(icu_collator, strgth); } if (t_t_mode==3) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_QUATERNARY); order = ucol_strcoll(icu_collator, strY, -1, strX, -1); t_t_mode = (order==UCOL_LESS) ? 
5 : 4; ucol_setStrength(icu_collator, strgth); } if (t_t_mode==2) { ini[0] = cu; return; } if (t_t_mode==4) { ini[0] = 0x162; return; } if (t_t_mode==5) { ini[0] = 0x21A; return; } } if (cu==0x0D0||cu==0x110) { /* check ð,Ð versus đ,Đ for Danish, Norwegian, Finnish, Swedish, Icelandic */ if (d_d_mode<=2) { strgth = ucol_getStrength(icu_collator); strX[0] = ch; strX[1] = 0x00; /* myself */ strZ[0] = 0x044; strZ[1] = 0x00; /* D */ ucol_setStrength(icu_collator, UCOL_PRIMARY); order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); if (d_d_mode==0 && order!=UCOL_EQUAL) { strX[0] = 0x0D0; strX[1] = 0x00; /* Ð, Eth */ strY[0] = 0x110; strY[1] = 0x00; /* Đ, D with Stroke */ order1 = ucol_strcoll(icu_collator, strY, -1, strX, -1); if (order1==UCOL_EQUAL) d_d_mode = 3; } if (d_d_mode<3) { d_d_mode = (order==UCOL_EQUAL) ? 1 : 2; } ucol_setStrength(icu_collator, strgth); } if (d_d_mode==3) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_QUATERNARY); order = ucol_strcoll(icu_collator, strY, -1, strX, -1); d_d_mode = (order==UCOL_LESS) ? 
5 : 4; ucol_setStrength(icu_collator, strgth); } if (d_d_mode==1) { ini[0] = 0x044; return; } if (d_d_mode==2) { ini[0] = cu; return; } if (d_d_mode==4) { ini[0] = 0x0D0; return; } if (d_d_mode==5) { ini[0] = 0x110; return; } } if (cu==0x0D6||cu==0x150 ||cu==0x0D8||cu==0x0D5||cu==0x0D4) { /* check Ö,ö versus Ő,ő for Hungarian Ø,ø versus Ö,ö for Danish, Norwegian Ö,ö versus Ø,ø,Ő,ő,Õ,õ for Finnish SFS 4600 Ö,ö versus Ø,ø,Ő,ő,Ô,ô for Swedish */ if (o_o_mode==0) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); strX[0] = 0x0D6; strX[1] = 0x00; /* Ö */ strY[0] = 0x0D8; strY[1] = 0x00; /* Ø */ strZ[0] = 0x04F; strZ[1] = 0x00; /* O */ order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); order1 = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS || order1==UCOL_LESS) { o_o_mode = 2; } else { o_o_mode = 1; } ucol_setStrength(icu_collator, strgth); } if (o_o_mode==2) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_SECONDARY); strX[0] = 0x0D6; strX[1] = 0x00; /* Ö */ strY[0] = 0x0D8; strY[1] = 0x00; /* Ø */ strZ[0] = 0x150; strZ[1] = 0x00; /* Ő */ order2 = ucol_strcoll(icu_collator, strY, -1, strZ, -1); order3 = ucol_strcoll(icu_collator, strZ, -1, strX, -1); order4 = ucol_strcoll(icu_collator, strY, -1, strX, -1); strW[0] = 0x0D5; strZ[1] = 0x00; /* Õ */ strY[0] = 0x0D4; strY[1] = 0x00; /* Ô */ order5 = ucol_strcoll(icu_collator, strW, -1, strX, -1); order6 = ucol_strcoll(icu_collator, strY, -1, strX, -1); if (order1==UCOL_LESS && order4==UCOL_LESS) { o_o_mode = 3; /* O < Ø << Ö */ if (order2==UCOL_LESS) o_o_mode = 4; /* O < Ø << Ö and O < Ø << Ő */ } else if (order==UCOL_LESS && order4==UCOL_GREATER) { o_o_mode = 6; /* O < Ö << Ø */ if (order3==UCOL_GREATER) o_o_mode = 7; /* O < Ö << Ø and O < Ö << Ő */ if (order3==UCOL_GREATER && order5==UCOL_GREATER) o_o_mode = 8; /* O < Ö << Ø and O < Ö << Ő and O < Ö << Õ */ if (order3==UCOL_GREATER && order6==UCOL_GREATER) o_o_mode = 9; /* O < 
Ö << Ø and O < Ö << Ő and O < Ö << Ô */ } else if (order==UCOL_LESS && order3==UCOL_GREATER) { o_o_mode = 5; /* O < Ö << Ő */ } ucol_setStrength(icu_collator, strgth); } if ((o_o_mode==3 && (ch==0x0D6||ch==0x0F6)) || /* Ö */ (o_o_mode==4 && (ch==0x150||ch==0x151||ch==0x0D6||ch==0x0F6)) || /* Ö,Ő */ (o_o_mode>=3 && o_o_mode<=4 && (ch==0x0D8||ch==0x0F8))) { /* Ø */ ini[0] = 0x0D8; /* Ø */ return; } if ((o_o_mode==5 && (ch==0x150||ch==0x151)) || /* Ő */ (o_o_mode==6 && (ch==0x0D8||ch==0x0F8)) || /* Ø */ (o_o_mode==7 && (ch==0x150||ch==0x151||ch==0x0D8||ch==0x0F8)) || /* Ő,Ø */ (o_o_mode==8 && (ch==0x150||ch==0x151|| ch==0x0D8||ch==0x0F8||ch==0x0D5||ch==0x0F5)) || /* Ő,Ø,Õ */ (o_o_mode==9 && (ch==0x150||ch==0x151|| ch==0x0D8||ch==0x0F8||ch==0x0D4||ch==0x0F4)) || /* Ő,Ø,Ô */ (o_o_mode>=5 && o_o_mode<=9 && (ch==0x0D6||ch==0x0F6))) { /* Ö */ ini[0] = 0x0D6; /* Ö */ return; } } if (cu==0x0DC||cu==0x170) { /* check Ü,ü versus Ű,ű for Hungarian Ü,ü,Ű,ű versus Y for Danish, Norwegian, Swedish, Finnish */ if (u_u_mode==0) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); strX[0] = 0x0DC; strX[1] = 0x00; /* Ü */ strY[0] = 0x170; strY[1] = 0x00; /* Ű */ strZ[0] = 0x055; strZ[1] = 0x00; /* U */ strW[0] = 0x059; strW[1] = 0x00; /* Y */ order = ucol_strcoll(icu_collator, strY, -1, strX, -1); order1 = ucol_strcoll(icu_collator, strZ, -1, strX, -1); order2 = ucol_strcoll(icu_collator, strW, -1, strX, -1); order3 = ucol_strcoll(icu_collator, strZ, -1, strY, -1); order4 = ucol_strcoll(icu_collator, strW, -1, strY, -1); ucol_setStrength(icu_collator, strgth); if (order1==UCOL_EQUAL && order3==UCOL_EQUAL) u_u_mode = 2; else if (order2==UCOL_EQUAL && order4==UCOL_EQUAL) u_u_mode = 3; else if (order1==UCOL_EQUAL && order4==UCOL_EQUAL) u_u_mode = 4; else if (order2==UCOL_EQUAL && order3==UCOL_EQUAL) u_u_mode = 5; else if (order1==UCOL_EQUAL) u_u_mode = 6; else if (order2==UCOL_EQUAL) u_u_mode = 7; else if (order3==UCOL_EQUAL) u_u_mode = 8; else if 
(order4==UCOL_EQUAL) u_u_mode = 9; else if (order ==UCOL_EQUAL) { ucol_setStrength(icu_collator, UCOL_SECONDARY); order5 = ucol_strcoll(icu_collator, strY, -1, strX, -1); ucol_setStrength(icu_collator, strgth); u_u_mode = (order5==UCOL_LESS) ? 11 : 10; } else u_u_mode = 1; } if (((u_u_mode==2||u_u_mode==4||u_u_mode==6) && (ch==0x0DC||ch==0x0FC)) || /* Ü */ ((u_u_mode==2||u_u_mode==5||u_u_mode==8) && (ch==0x170||ch==0x171))) { /* Ű */ ini[0] = 0x055; /* U */ return; } if (((u_u_mode==3||u_u_mode==5||u_u_mode==7) && (ch==0x0DC||ch==0x0FC)) || /* Ü */ ((u_u_mode==3||u_u_mode==4||u_u_mode==9) && (ch==0x170||ch==0x171))) { /* Ű */ ini[0] = 0x059; /* Y */ return; } if (u_u_mode==10) { ini[0] = 0x0DC; /* Ü */ return; } if (u_u_mode==11) { ini[0] = 0x170; /* Ű */ return; } } if (cu==0x118) { /* check Ę,ę versus Ä,ä,Æ,æ for Norwegian, Swedish */ if (e_e_mode==0) { strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); strX[0] = 0x118; strX[1] = 0x00; /* Ę */ strY[0] = 0x0C4; strY[1] = 0x00; /* Ä */ strZ[0] = 0x0C6; strZ[1] = 0x00; /* Æ */ strW[0] = 0x045; strW[1] = 0x00; /* E */ order = ucol_strcoll(icu_collator, strW, -1, strX, -1); order1 = ucol_strcoll(icu_collator, strY, -1, strX, -1); order2 = ucol_strcoll(icu_collator, strZ, -1, strX, -1); order3 = ucol_strcoll(icu_collator, strZ, -1, strY, -1); ucol_setStrength(icu_collator, strgth); if (order ==UCOL_EQUAL) e_e_mode = 2; else if (order1==UCOL_EQUAL && order2==UCOL_EQUAL) { ucol_setStrength(icu_collator, UCOL_SECONDARY); order5 = ucol_strcoll(icu_collator, strZ, -1, strY, -1); ucol_setStrength(icu_collator, strgth); e_e_mode = (order5==UCOL_LESS) ? 
4 : 3; } else if (order1==UCOL_EQUAL) e_e_mode = 5; else if (order2==UCOL_EQUAL) e_e_mode = 6; else e_e_mode = 1; } if (e_e_mode==3||e_e_mode==5) { ini[0] = 0x0C4; /* Ä */ return; } if (e_e_mode==4||e_e_mode==6) { ini[0] = 0x0C6; /* Æ */ return; } } if (cu==0x0C6||cu==0x0DE||ch==0x0DF||ch==0x1E9E ||cu==0x132||cu==0x13F||ch==0x149||cu==0x14A ||cu==0x152||cu==0x490) { strX[0] = cu; strX[1] = 0x00; /* ex. "Æ" "Œ" */ switch (cu) { case 0x0C6: /* Æ æ */ strZ[0] = 0x41; /* A */ strW[0] = 0xC4; break; /* Ä */ case 0x152: /* Œ œ */ strZ[0] = 0x4F; /* O */ strY[0] = 0xD8; /* Ø */ strW[0] = 0xD6; break; /* Ö */ case 0x0DF: case 0x1E9E: /* ß ẞ (u_toupper(ch) returns ch) */ strZ[0] = 0x53; break; /* S */ case 0x0DE: /* Þ þ */ strZ[0] = 0x54; break; /* T */ case 0x132: /* IJ ij */ strZ[0] = 0x59; /* Y */ strZ[1] = 0x00; if (ucol_equal(icu_collator, strZ, -1, strX, -1)) { ini[0]=0x59; return; } strZ[0] = 0x49; break; /* I */ case 0x13F: /* Ŀ ŀ */ strZ[0] = 0x4C; break; /* L */ case 0x149: /* ʼn (u_toupper(ch) returns ch) */ case 0x14A: /* Ŋ ŋ */ strZ[0] = 0x4E; break; /* N */ case 0x490: /* Ґ ґ */ strZ[0] = 0x413; break; /* Г */ } strZ[1] = (cu==0x490) ? 0x42F : 0x5A; strZ[2] = 0x00; /* ex. "AZ" "OZ" "ГЯ" */ order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); if (order==UCOL_GREATER) { ini[0]=strZ[0]; return; } /* not ligature */ if (cu==0x0C6) { /* check Æ versus Ä for Danish, Norwegian, Finnish, Swedish */ strW[1] = 0x00; strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); order = ucol_strcoll(icu_collator, strW, -1, strX, -1); ucol_setStrength(icu_collator, UCOL_SECONDARY); order1 = ucol_strcoll(icu_collator, strW, -1, strX, -1); strgth = ucol_getStrength(icu_collator); if (order==UCOL_EQUAL) { ini[0] = (order1==UCOL_LESS) ? 
strW[0] : strX[0]; return; } } if (cu==0x152) { /* check Œ versus Ö,Ø for Finnish, Norwegian, Swedish */ strY[1] = 0x00; strW[1] = 0x00; strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); order = ucol_strcoll(icu_collator, strW, -1, strX, -1); order2 = ucol_strcoll(icu_collator, strY, -1, strX, -1); ucol_setStrength(icu_collator, UCOL_SECONDARY); order1 = ucol_strcoll(icu_collator, strW, -1, strX, -1); order3 = ucol_strcoll(icu_collator, strY, -1, strX, -1); order4 = ucol_strcoll(icu_collator, strY, -1, strW, -1); strgth = ucol_getStrength(icu_collator); if (order==UCOL_EQUAL && order2==UCOL_EQUAL) { if (order1!=UCOL_LESS && order3!=UCOL_LESS) ini[0] = strX[0]; else if (order4==UCOL_LESS) ini[0] = strY[0]; else ini[0] = strW[0]; return; } else if (order==UCOL_EQUAL) { ini[0] = (order1==UCOL_LESS) ? strW[0] : strX[0]; return; } else if (order2==UCOL_EQUAL) { ini[0] = (order3==UCOL_LESS) ? strY[0] : strX[0]; return; } } } else if ((is_latin(&ch)&&ch>0x7F)|| (is_cyrillic(&ch)&&(ch<0x410||ch==0x419||ch==0x439||ch>0x44F))|| (is_greek(&ch)&&(ch<0x391||(ch>0x3A9&&ch<0x3B1)||ch>0x3C9))) { /* check diacritic */ src[0]=ch; src[1]=0x00; perr=U_ZERO_ERROR; unorm2_normalize(unormalizer_NFD, src, 1, dest, 8, &perr); if (U_SUCCESS(perr)) { if (is_latin(&ch)) { strZ[1] = 0x05A; } /* Z */ else if (is_cyrillic(&ch)) { strZ[1] = 0x42F; } /* Я */ else { strZ[1] = 0x3A9; } /* Ω */ strZ[0] = u_toupper(dest[0]); strZ[2] = 0x00; /* ex. "AZ" */ strX[0] = cu; strX[1] = 0x00; /* ex. 
"Å" */ order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); if (order==UCOL_LESS) { /* with diacritic */ if (strX[0]!=0xC4) { /* Ä */ ini[0]=strX[0]; return; } strZ[0] = 0x0C6; strZ[1] = 0x00; /* Æ */ strgth = ucol_getStrength(icu_collator); ucol_setStrength(icu_collator, UCOL_PRIMARY); order = ucol_strcoll(icu_collator, strZ, -1, strX, -1); ucol_setStrength(icu_collator, UCOL_SECONDARY); order1 = ucol_strcoll(icu_collator, strZ, -1, strX, -1); strgth = ucol_getStrength(icu_collator); ini[0] = (order==UCOL_EQUAL && order1==UCOL_LESS) ? strZ[0] : strX[0]; return; } ch=dest[0]; /* without diacritic */ } } if (is_latin(istr)&&u_strlen(istr)>1) { for(k=0; k<(u_strlen(istr)>2 ? 3 : 2); k++) { strX[k]=u_toupper(istr[k]); } strX[k]=L'\0'; /* DZ, SZ or DZS for Hungarian, ad-hoc treatment */ if ((strX[0]==0x44 || strX[0]==0x53) && strX[1]==0x5A) { /* DZ SZ */ strY[0]=0x44; strY[1]=0x5A; strY[2]=0x53; strY[3]=L'\0'; /* DZS */ strZ[0]=0x44; strZ[1]=0x5A; strZ[2]=0x5A; strZ[3]=L'\0'; /* DZZ */ order = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { ini[0]=strX[0]; ini[1]=strX[1]; if (strX[0]==0x44 && strX[2]==0x53) { /* DZS */ ini[2]=0x53; ini[3]=L'\0'; } else { /* DZ SZ */ ini[2]=L'\0'; } return; } } /* DZ, DŽ for Slovak or Serbo-Croatian, ad-hoc treatment */ if (strX[0]==0x44 && (strX[1]==0x5A || strX[1]==0x17D)) { /* DZ DŽ */ strY[0]=0x44; strY[1]=0x17D; strY[2]=L'\0'; /* DŽ */ strZ[0]=0x44; strZ[1]=0x5A; strZ[2]=0x5A; strZ[3]=L'\0'; /* DZZ */ order = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { if (strX[1]==0x5A) { strY[0]=0xD4; strY[1]=L'\0'; /* Ô */ strZ[0]=0x4F; strZ[1]=0x5A; strZ[2]=L'\0'; /* OZ */ order = ucol_strcoll(icu_collator, strZ, -1, strY, -1); if (order==UCOL_LESS) { /* Slovak DZ */ ini[0]=strX[0]; ini[1]=strX[1]; ini[2]=L'\0'; return; } } else { ini[0]=strX[0]; ini[1]=strX[1]; /* DŽ */ ini[2]=L'\0'; return; } } } /* NG for Welsh */ if (strX[0]==0x4E && strX[1]==0x47) { /* NG */ strY[0]=0x4E; 
strY[1]=L'\0'; /* N */
			strZ[0]=0x4E; strZ[1]=0x47; strZ[2]=0x5A; strZ[3]=L'\0'; /* NGZ */
			order = ucol_strcoll(icu_collator, strZ, -1, strY, -1);
			if (order==UCOL_LESS) {
				ini[0]=strX[0]; ini[1]=strX[1]; /* NG */
				ini[2]=L'\0';
				return;
			}
		}
		/* AA for Norwegian, Danish */
		if (strX[0]==0x41 && strX[1]==0x41) { /* AA */
			strX[2]=L'\0';
			strY[0]=0xC5; strY[1]=L'\0'; /* Å */
			/* NOTE(review): only indices 0, 1 and 3 of strZ are written here, yet
			 * strZ is passed to ucol_strcoll() with length -1 (NUL-terminated).
			 * strZ[2] keeps whatever an earlier branch left in it; presumably
			 * strZ[2]=L'\0' was intended for the two-unit probe "AB" -- confirm. */
			strZ[0]=0x41; strZ[1]=0x42; strZ[3]=L'\0'; /* AB */
			order = ucol_strcoll(icu_collator, strZ, -1, strX, -1);
			/* NOTE(review): the collator strength is switched to UCOL_PRIMARY here
			 * and is not set back anywhere in the remainder of this function;
			 * strgth is assigned after the change and not used below. Presumably
			 * the previous strength should be saved first and restored with
			 * ucol_setStrength(icu_collator, strgth) -- confirm. */
			ucol_setStrength(icu_collator, UCOL_PRIMARY);
			order1 = ucol_strcoll(icu_collator, strY, -1, strX, -1);
			strgth = ucol_getStrength(icu_collator);
			if (order==UCOL_LESS) {
				if (order1==UCOL_EQUAL) {
					ini[0]=strY[0]; ini[1]=L'\0'; /* Å */
				} else {
					ini[0]=strX[0]; ini[1]=strX[1]; /* AA */
					ini[2]=L'\0';
				}
				return;
			}
		}
		/* other digraphs */
		if(((strX[0]==0x43 || strX[0]==0x44 || strX[0]==0x50 || strX[0]==0x52 || strX[0]==0x53 || strX[0]==0x54 || strX[0]==0x58 || strX[0]==0x5A) && strX[1]==0x48) || /* CH DH PH RH SH TH XH ZH */
		   (strX[0]==0x44 && strX[1]==0x44) || /* DD */
		   (strX[0]==0x46 && strX[1]==0x46) || /* FF */
		   (strX[0]==0x4C && strX[1]==0x4C) || /* LL */
		   ((strX[0]==0x47 || strX[0]==0x4C || strX[0]==0x4E) && strX[1]==0x4A) || /* GJ LJ NJ */
		   (strX[0]==0x52 && strX[1]==0x52) || /* RR */
		   ((strX[0]==0x43 || strX[0]==0x5A) && strX[1]==0x53) || /* CS ZS */
		   ((strX[0]==0x47 || strX[0]==0x4C || strX[0]==0x4E || strX[0]==0x54) && strX[1]==0x59)) /* GY LY NY TY */
		{
			strX[2]=L'\0';
			strZ[0]=strX[0]; strZ[1]=0x5A; strZ[2]=L'\0';
			order = ucol_strcoll(icu_collator, strZ, -1, strX, -1);
			if (order==UCOL_LESS) {
				ini[0]=strX[0]; ini[1]=strX[1]; ini[2]=L'\0';
				return;
			}
		}
	}
	if (ch==0x0AA) ch=L'A';
	if (ch==0x0BA) ch=L'O';
	ini[0]=u_toupper(ch);
	return;
}

/*
 * Compare an index initial against the initial derived from one character.
 *
 * The character at ch (one UChar, or two when is_surrogate_pair(ch) reports a
 * surrogate pair) is copied into a NUL-terminated scratch string, passed
 * through index_normalize() to obtain its initial, and that initial is
 * compared with ini using ss_comp().
 *
 * ini: initial to compare (first argument to ss_comp()).
 * ch:  pointer to the character whose initial is computed.
 *
 * Returns 1 when ss_comp(ini, <normalized initial>) is negative, 0 otherwise.
 */
static int initial_cmp_char(UChar *ini, UChar *ch)
{
	UChar initial_tmp[INITIALLENGTH],istr[3];
	int chset, l;

	/* l is the number of UChar units copied from ch: 2 for a surrogate pair, else 1 */
	l = is_surrogate_pair(ch) ?
2 : 1;
	/* copy the l code units of the character and terminate the scratch string */
	istr[0]=ch[0];
	if (l==2) istr[1]=ch[1];
	istr[l]=L'\0';
	/* chset is filled in by index_normalize() but not used afterwards here */
	index_normalize(istr, initial_tmp, &chset);
	return (ss_comp(ini, initial_tmp)<0);
}

/*
 * Select the hanzi index mode for the current collator and fill in the
 * hz_index[].idx header strings.
 *
 * The probe character U+5B57 is collated against the first threshold of each
 * candidate table (HZ_RADICAL, HZ_STROKE, HZ_PINYIN, HZ_ZHUYIN, in that
 * order); the first table whose threshold the probe collates after
 * (UCOL_GREATER) determines hzmode and hz_idx_init. If none matches,
 * HANZI_UNKNOWN / HZ_UNKNOWN are used.
 *
 * Returns the selected mode (one of the HANZI_* values assigned below).
 */
static int init_hanzi_header(void)
{
	UChar strX[2],*pch0,*pch1;
	int k, hzmode, len;
	struct hanzi_index *hz_idx_init;

	/* one-character probe string: U+5B57 */
	strX[0]=0x5B57; strX[1]=L'\0';
	if (ucol_strcoll(icu_collator, strX, -1, HZ_RADICAL[0].threshold, -1)==UCOL_GREATER) {
		hzmode=HANZI_UNIHAN; hz_idx_init=HZ_RADICAL;
	} else if (ucol_strcoll(icu_collator, strX, -1, HZ_STROKE[0].threshold, -1)==UCOL_GREATER) {
		hzmode=HANZI_STROKE; hz_idx_init=HZ_STROKE;
	} else if (ucol_strcoll(icu_collator, strX, -1, HZ_PINYIN[0].threshold, -1)==UCOL_GREATER) {
		hzmode=HANZI_PINYIN; hz_idx_init=HZ_PINYIN;
	} else if (ucol_strcoll(icu_collator, strX, -1, HZ_ZHUYIN[0].threshold, -1)==UCOL_GREATER) {
		hzmode=HANZI_ZHUYIN; hz_idx_init=HZ_ZHUYIN;
	} else {
		hzmode=HANZI_UNKNOWN; hz_idx_init=HZ_UNKNOWN;
	}

	/* NOTE(review): the statement below reads `for (k=0;k0) {`, which is not a
	 * well-formed loop header, and the braces from here to the end of the
	 * function do not balance (one extra closing brace). Text between `k` and
	 * `0)` appears to be missing from this copy -- restore it from the
	 * original file. Where pch0/pch1 are first assigned and where hz_idx_init
	 * is consumed is not visible here. */
	for (k=0;k0) { len=pch1-pch0; } else { len=u_strlen(pch0); }
	/* clamp the header so that it fits hz_index[k].idx including the terminator */
	if (len>=INITIALLENGTH) {
		warn_printf(efp, "\nWarning: Too long (%d) hanzi header.\n", len);
		len=INITIALLENGTH-1;
	}
	/* u_strncpy does not add a terminator when len units are copied; add it explicitly */
	u_strncpy(hz_index[k].idx,pch0,len);
	hz_index[k].idx[len]=L'\0';
	/* NOTE(review): pch1>0 is a relational comparison of a pointer against 0;
	 * presumably a NULL check (pch1!=NULL) is intended -- confirm. When it
	 * holds, the next header starts one unit past pch1; otherwise stop. */
	if (pch1>0) pch0=pch1+1;
	else break;
	}
	return hzmode;
}