Apache Solr的索引和查询顺序
阅读了Solr的部分源码之后,很想弄明白Solr在索引和查询时分析器(analyzer)中分词器与各个过滤器的执行顺序,以下是探究结果.
所有的配置都在solr/example/solr/conf/schema.xml当中.
1 <!-- 如下是对text类型的处理 -->
2 <fieldType name="text" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
3 <!-- 索引顺序1空格2过滤词3拆字4小写过滤5关键字6词干抽取算法(同义词过滤器在本例中已被注释掉,索引时不生效)-->
4 <analyzer type="index">
5 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
6 <!-- in this example, we will only use synonyms at query time
7 <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/>
8 -->
9 <!-- Case insensitive stop word removal.
10 add enablePositionIncrements=true in both the index and query
11 analyzers to leave a 'gap' for more accurate phrase queries.
12 -->
13 <filter class="solr.StopFilterFactory"
14 ignoreCase="true"
15 words="stopwords.txt"
16 enablePositionIncrements="true"
17 />
18 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
19 <filter class="solr.LowerCaseFilterFactory"/>
20 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
21 <filter class="solr.PorterStemFilterFactory"/>
22 </analyzer>
23 <!-- 查询顺序1空格2同义词3过滤词4拆字5小写过滤6关键字7词干抽取算法-->
24 <analyzer type="query">
25 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
26 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
27 <filter class="solr.StopFilterFactory"
28 ignoreCase="true"
29 words="stopwords.txt"
30 enablePositionIncrements="true"
31 />
32 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
33 <filter class="solr.LowerCaseFilterFactory"/>
34 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
35 <filter class="solr.PorterStemFilterFactory"/>
36 </analyzer>
37 </fieldType>
38
39
40 <!-- Less flexible matching, but fewer false matches. Probably not ideal for product names,
41 but may be good for SKUs. Can insert dashes in the wrong place and still match. -->
42 <!-- 针对textTight类型-->
43 <fieldType name="textTight" class="solr.TextField" positionIncrementGap="100">
44 <!-- 索引和查询共用此分析器(未指定type),顺序:1空格2同义词3过滤词4拆字5小写过滤6关键字7英文最小词干提取8去除重复词
45 -->
46 <analyzer>
47 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
48 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
49 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/>
50 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
51 <filter class="solr.LowerCaseFilterFactory"/>
52 <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
53 <filter class="solr.EnglishMinimalStemFilterFactory"/>
54 <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes
55 possible with WordDelimiterFilter in conjunction with stemming. -->
56 <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
57 </analyzer>
58 </fieldType>
59
60
61 <!-- A general unstemmed text field - good if one does not know the language of the field -->
62 <!-- 针对textgen类型 -->
63 <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100">
64 <!-- 索引顺序1空格2过滤词3拆字4小写过滤-->
65 <analyzer type="index">
66 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
67 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
68 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
69 <filter class="solr.LowerCaseFilterFactory"/>
70 </analyzer>
71 <!-- 查询顺序1空格2同义词3过滤词4拆字5小写过滤-->
72 <analyzer type="query">
73 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
74 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
75 <filter class="solr.StopFilterFactory"
76 ignoreCase="true"
77 words="stopwords.txt"
78 enablePositionIncrements="true"
79 />
80 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
81 <filter class="solr.LowerCaseFilterFactory"/>
82 </analyzer>
83 </fieldType>
84
85
86 <!-- A general unstemmed text field that indexes tokens normally and also
87 reversed (via ReversedWildcardFilterFactory), to enable more efficient
88 leading wildcard queries. -->
89 <!-- 针对text_rev类型 -->
90 <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100">
91 <!-- 索引顺序1空格2过滤词3拆字4小写过滤5反转词元(用于加速前导通配符查询)-->
92 <analyzer type="index">
93 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
94 <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/>
95 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/>
96 <filter class="solr.LowerCaseFilterFactory"/>
97 <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
98 maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
99 </analyzer>
100 <!-- 查询顺序1空格2同义词3过滤词4拆字5小写过滤 -->
101 <analyzer type="query">
102 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
103 <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
104 <filter class="solr.StopFilterFactory"
105 ignoreCase="true"
106 words="stopwords.txt"
107 enablePositionIncrements="true"
108 />
109 <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/>
110 <filter class="solr.LowerCaseFilterFactory"/>
111 </analyzer>
112 </fieldType>
113
114 <!-- charFilter + WhitespaceTokenizer -->
115 <!--
116 <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100">
117 <analyzer>
118 <charFilter class="solr.MappingCharFilterFactory" mapping="mapping-ISOLatin1Accent.txt"/>
119 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
120 </analyzer>
121 </fieldType>
122 -->
123
124 <!-- This is an example of using the KeywordTokenizer along
125 with various TokenFilterFactories to produce a sortable field
126 that does not include some properties of the source text
127 -->
128 <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
129 <analyzer>
130 <!-- KeywordTokenizer does no actual tokenizing, so the entire
131 input string is preserved as a single token
132 -->
133 <tokenizer class="solr.KeywordTokenizerFactory"/>
134 <!-- The LowerCase TokenFilter does what you expect, which can be
135 useful when you want your sorting to be case insensitive
136 -->
137 <filter class="solr.LowerCaseFilterFactory"/>
138 <!-- The TrimFilter removes any leading or trailing whitespace -->
139 <filter class="solr.TrimFilterFactory"/>
140 <!-- The PatternReplaceFilter gives you the flexibility to use
141 Java Regular expression to replace any sequence of characters
142 matching a pattern with an arbitrary replacement string,
143 which may include back references to portions of the original
144 string matched by the pattern.
145
146 See the Java Regular Expression documentation for more
147 information on pattern and replacement string syntax.
148
149 http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html
150 -->
151 <filter class="solr.PatternReplaceFilterFactory"
152 pattern="([^a-z])" replacement="" replace="all"
153 />
154 </analyzer>
155 </fieldType>
156
157 <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField">
158 <analyzer>
159 <tokenizer class="solr.StandardTokenizerFactory"/>
160 <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
161 </analyzer>
162 </fieldtype>
163
164 <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField">
165 <analyzer>
166 <tokenizer class="solr.WhitespaceTokenizerFactory"/>
167 <!--
168 The DelimitedPayloadTokenFilter can put payloads on tokens... for example,
169 a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f
170 Attributes of the DelimitedPayloadTokenFilterFactory :
171 "delimiter" - a one character delimiter. Default is | (pipe)
172 "encoder" - how to encode the following value into a payload
173 float -> org.apache.lucene.analysis.payloads.FloatEncoder,
174 integer -> o.a.l.a.p.IntegerEncoder
175 identity -> o.a.l.a.p.IdentityEncoder
176 Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor.
177 -->
178 <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
179 </analyzer>
180 </fieldtype>
181
182 <!-- lowercases the entire field value, keeping it as a single token. -->
183 <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
184 <analyzer>
185 <tokenizer class="solr.KeywordTokenizerFactory"/>
186 <filter class="solr.LowerCaseFilterFactory"/>
187 </analyzer>
188 </fieldType>
相关文章