Rev 1310 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
211 | jmachado | 1 | |
214 | jmachado | 2 | package jomm.utils; |
211 | jmachado | 3 | |
1350 | jmachado | 4 | import org.apache.lucene.analysis.Token; |
211 | jmachado | 5 | import org.apache.lucene.analysis.TokenFilter; |
6 | import org.apache.lucene.analysis.TokenStream; |
||
7 | |||
1350 | jmachado | 8 | import java.io.IOException; |
9 | import java.text.Normalizer; |
||
211 | jmachado | 10 | |
11 | |||
12 | /** |
||
13 | * Strips diacritics from token text by replacing accented Latin-1 characters with plain ASCII letters. |
||
14 | * |
||
15 | * @version $Id: DiacriticFilter.java,v 1.2 2007/12/27 01:45:58 jmachado Exp $ |
||
16 | */ |
||
17 | public final class DiacriticFilter extends TokenFilter |
||
18 | { |
||
19 | public DiacriticFilter(TokenStream in) |
||
20 | { |
||
21 | super(in); |
||
22 | } |
||
23 | |||
24 | public final Token next() throws IOException |
||
25 | { |
||
26 | Token t = input.next(); |
||
27 | |||
28 | if (t == null) |
||
29 | return null; |
||
30 | String value = t.termText(); |
||
31 | return new Token(clean(value),t.startOffset(),t.endOffset()); |
||
32 | } |
||
33 | |||
1350 | jmachado | 34 | public static String stripAccents(String s) |
35 | { |
||
36 | s = Normalizer.normalize(s, Normalizer.Form.NFD); |
||
37 | s = s.replaceAll("[\\p{InCombiningDiacriticalMarks}]", ""); |
||
38 | return s; |
||
39 | } |
||
211 | jmachado | 40 | public static String clean(String value) |
41 | { |
||
42 | StringBuilder finalTerm = new StringBuilder(); |
||
43 | for (int j=0 ; j < value.length() ; j++) |
||
44 | { |
||
45 | char c = value.charAt(j); |
||
46 | switch(c) |
||
47 | { |
||
48 | case 131: c = 'f'; break; |
||
49 | case 138: c = 'S'; break; |
||
50 | case 140: c = 'E'; break; |
||
51 | case 154: c = 'S'; break; |
||
52 | case 156: c = 'e'; break; |
||
53 | case 159: c = 'Y'; break; |
||
54 | case 167: c = 'S'; break; |
||
55 | case 169: c = 'c'; break; |
||
56 | case 192: c = 'A'; break; |
||
57 | case 193: c = 'A'; break; |
||
58 | case 194: c = 'A'; break; |
||
59 | case 195: c = 'A'; break; |
||
60 | case 196: c = 'A'; break; |
||
61 | case 197: c = 'A'; break; |
||
62 | case 198: c = 'A'; break; |
||
63 | case 199: c = 'C'; break; |
||
64 | case 200: c = 'E'; break; |
||
65 | case 201: c = 'E'; break; |
||
66 | case 202: c = 'E'; break; |
||
67 | case 203: c = 'E'; break; |
||
68 | case 204: c = 'I'; break; |
||
69 | case 205: c = 'I'; break; |
||
70 | case 206: c = 'I'; break; |
||
71 | case 207: c = 'I'; break; |
||
72 | case 208: c = 'D'; break; |
||
73 | case 209: c = 'N'; break; |
||
74 | case 210: c = 'O'; break; |
||
75 | case 211: c = 'O'; break; |
||
76 | case 212: c = 'O'; break; |
||
77 | case 213: c = 'O'; break; |
||
78 | case 214: c = 'O'; break; |
||
79 | case 215: c = 'X'; break; |
||
80 | case 216: c = 'O'; break; |
||
81 | case 217: c = 'U'; break; |
||
82 | case 218: c = 'U'; break; |
||
83 | case 219: c = 'U'; break; |
||
84 | case 220: c = 'U'; break; |
||
85 | case 221: c = 'Y'; break; |
||
86 | case 222: c = 'P'; break; |
||
87 | case 223: c = 'B'; break; |
||
88 | case 224: c = 'a'; break; |
||
89 | case 225: c = 'a'; break; |
||
90 | case 226: c = 'a'; break; |
||
91 | case 227: c = 'a'; break; |
||
92 | case 228: c = 'a'; break; |
||
93 | case 229: c = 'a'; break; |
||
94 | case 230: c = 'a'; break; |
||
95 | case 231: c = 'c'; break; |
||
96 | case 232: c = 'e'; break; |
||
97 | case 233: c = 'e'; break; |
||
98 | case 234: c = 'e'; break; |
||
99 | case 235: c = 'e'; break; |
||
100 | case 236: c = 'i'; break; |
||
101 | case 237: c = 'i'; break; |
||
102 | case 238: c = 'i'; break; |
||
103 | case 239: c = 'i'; break; |
||
104 | case 240: c = 'o'; break; |
||
105 | case 241: c = 'n'; break; |
||
106 | case 242: c = 'o'; break; |
||
107 | case 243: c = 'o'; break; |
||
108 | case 244: c = 'o'; break; |
||
109 | case 245: c = 'o'; break; |
||
110 | case 246: c = 'o'; break; |
||
111 | case 247: break; |
||
112 | case 248: c = 'o'; break; |
||
113 | case 249: c = 'u'; break; |
||
114 | case 250: c = 'u'; break; |
||
115 | case 251: c = 'u'; break; |
||
116 | case 252: c = 'u'; break; |
||
117 | case 253: c = 'y'; break; |
||
118 | case 254: c = 'p'; break; |
||
119 | } |
||
120 | finalTerm.append(c); |
||
121 | } |
||
122 | return finalTerm.toString(); |
||
123 | } |
||
124 | |||
125 | public static void main(String args[]) |
||
126 | { |
||
127 | for(int i = 0; i < 255;i++) |
||
128 | System.out.println(i + ":" + (char)i); |
||
129 | } |
||
130 | } |