The following are code examples of org.apache.lucene.index.IndexReader#maxDoc(), collected from open source projects.
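Most of the snippets below use maxDoc() either as an exclusive upper bound when iterating over document IDs or as a rough collection size for scoring. As a quick orientation, here is a minimal, self-contained sketch contrasting maxDoc() with numDocs(); the index path is a placeholder and not taken from any snippet on this page.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.store.FSDirectory;

public class MaxDocDemo {
    public static void main(String[] args) throws Exception {
        // maxDoc() counts every document ID ever assigned in the reader's view,
        // including deleted documents; numDocs() counts only live documents.
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            int maxDoc = reader.maxDoc();          // valid document IDs are 0 <= docID < maxDoc
            int numDocs = reader.numDocs();        // live (non-deleted) documents
            int deleted = reader.numDeletedDocs(); // always maxDoc - numDocs
            System.out.println(maxDoc + " = " + numDocs + " + " + deleted);
        }
    }
}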
@Override
public Query rewrite(IndexReader reader) throws IOException {
    IndexReaderContext context = reader.getContext();
    TermContext[] ctx = new TermContext[terms.length];
    int[] docFreqs = new int[ctx.length];
    for (int i = 0; i < terms.length; i++) {
        ctx[i] = TermContext.build(context, terms[i]);
        docFreqs[i] = ctx[i].docFreq();
    }
    final int maxDoc = reader.maxDoc();
    blend(ctx, maxDoc, reader);
    Query query = topLevelQuery(terms, ctx, docFreqs, maxDoc);
    query.setBoost(getBoost());
    return query;
}
@Override
protected int[] parseSet() throws IOException
{
    IntSet WIDs = new AllWIDs(lang).getDataset();
    int max_wid = 0;
    for(int wid: WIDs)
        if (wid > max_wid)
            max_wid = wid;
    IndexReader topics = Indexes.getReader(RepositoryDirs.TOPICS.getPath(lang));
    int max = topics.maxDoc();
    int[] map = new int[max_wid+1];
    for(int i=0; i<map.length; i++) map[i] = -1;
    PLogger plog = new PLogger(log, Step.MINUTE)
            .setEnd(max)
            .start();
    for(int i=0; i<max; i++) {
        map[Integer.parseInt(topics.document(i).get(TopicIndexer.FIELD_WID))] = i;
        plog.update();
    }
    plog.stop();
    return map;
}
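The array returned above maps a Wikipedia page ID (WID) to its Lucene document ID in the topics index, with -1 marking absent WIDs. A hedged usage sketch, reusing the snippet's own Indexes/RepositoryDirs helpers (the wid value is a placeholder and surrounding imports are omitted):

int[] widToDoc = parseSet();   // the WID -> docID array built above
IndexReader topics = Indexes.getReader(RepositoryDirs.TOPICS.getPath(lang));
int wid = 12345;               // placeholder Wikipedia ID
int docId = (wid < widToDoc.length) ? widToDoc[wid] : -1;
if (docId >= 0) {
    Document topicDoc = topics.document(docId);   // the indexed topic for this WID
}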
/**
 * Extracts all term texts of a given Query into an array of WeightedTerms
 *
 * @param query Query to extract term texts from
 * @param reader used to compute IDF, which can be used to a) score selected fragments better
 * and b) apply graded highlights, e.g. changing the intensity of font color
 * @param fieldName the field on which Inverse Document Frequency (IDF) calculations are based
 * @return an array of the terms used in a query, plus their weights.
 */
public static final WeightedTerm[] getIdfWeightedTerms(Query query, IndexReader reader, String fieldName)
{
    WeightedTerm[] terms = getTerms(query, false, fieldName);
    int totalNumDocs = reader.maxDoc();
    for (int i = 0; i < terms.length; i++)
    {
        try
        {
            int docFreq = reader.docFreq(new Term(fieldName, terms[i].term));
            // IDF algorithm taken from ClassicSimilarity class
            float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
            terms[i].weight *= idf;
        }
        catch (IOException e)
        {
            // ignore
        }
    }
    return terms;
}
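In Lucene's highlighter this method is a static helper on QueryTermExtractor; a minimal usage sketch, assuming an open IndexReader, a "contents" field (placeholder), and the usual org.apache.lucene.search.highlight imports:

Query q = new TermQuery(new Term("contents", "lucene"));
WeightedTerm[] weighted = QueryTermExtractor.getIdfWeightedTerms(q, reader, "contents");
for (WeightedTerm wt : weighted) {
    // rarer terms end up with higher IDF-scaled weights
    System.out.println(wt.getTerm() + " -> " + wt.getWeight());
}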
@Override
public Query rewrite(IndexReader reader) throws IOException {
    Query rewritten = super.rewrite(reader);
    if (rewritten != this) {
        return rewritten;
    }
    IndexReaderContext context = reader.getContext();
    TermStates[] ctx = new TermStates[terms.length];
    int[] docFreqs = new int[ctx.length];
    for (int i = 0; i < terms.length; i++) {
        ctx[i] = TermStates.build(context, terms[i], true);
        docFreqs[i] = ctx[i].docFreq();
    }
    final int maxDoc = reader.maxDoc();
    blend(ctx, maxDoc, reader);
    return topLevelQuery(terms, ctx, docFreqs, maxDoc);
}
public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
    this.field = field;
    if (terms == null) {
        throw new IllegalArgumentException("Field: [" + field + "] does not exist");
    }
    this.terms = terms;
    final long vocSize = terms.getSumTotalTermFreq();
    this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
    this.useTotalTermFreq = vocSize != -1;
    this.numTerms = terms.size();
    this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
    this.reader = reader;
    this.realWordLikelyhood = realWordLikelyHood;
    this.separator = separator;
}
/**
 * Creates a Map of <code>WeightedSpanTerms</code> from the given <code>Query</code> and <code>TokenStream</code>. Uses a supplied
 * <code>IndexReader</code> to properly weight terms (for gradient highlighting).
 *
 * <p>
 *
 * @param query
 *          the query that caused the hit
 * @param tokenStream
 *          the token stream of the text to be highlighted
 * @param fieldName
 *          restricts the Terms used, based on field name
 * @param reader
 *          the IndexReader used for scoring
 * @return Map of WeightedSpanTerms with quasi tf/idf scores
 * @throws IOException If there is a low-level I/O error
 */
public Map<String,WeightedSpanTerm> getWeightedSpanTermsWithScores(Query query, float boost, TokenStream tokenStream, String fieldName,
        IndexReader reader) throws IOException {
    if (fieldName != null) {
        this.fieldName = fieldName;
    } else {
        this.fieldName = null;
    }
    this.tokenStream = tokenStream;
    Map<String,WeightedSpanTerm> terms = new PositionCheckingMap<>();
    extract(query, boost, terms);
    int totalNumDocs = reader.maxDoc();
    Set<String> weightedTerms = terms.keySet();
    Iterator<String> it = weightedTerms.iterator();
    try {
        while (it.hasNext()) {
            WeightedSpanTerm weightedSpanTerm = terms.get(it.next());
            int docFreq = reader.docFreq(new Term(fieldName, weightedSpanTerm.term));
            // IDF algorithm taken from ClassicSimilarity class
            float idf = (float) (Math.log(totalNumDocs / (double) (docFreq + 1)) + 1.0);
            weightedSpanTerm.weight *= idf;
        }
    } finally {
        IOUtils.close(internalReader);
    }
    return terms;
}
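The method above matches WeightedSpanTermExtractor#getWeightedSpanTermsWithScores in Lucene's highlighter; a hedged usage sketch, assuming an Analyzer, a parsed Query, and an open IndexReader are already available (field name and text are placeholders, imports omitted):

WeightedSpanTermExtractor extractor = new WeightedSpanTermExtractor();
TokenStream ts = analyzer.tokenStream("contents", "text to be highlighted");
Map<String, WeightedSpanTerm> weightedTerms =
        extractor.getWeightedSpanTermsWithScores(query, 1f, ts, "contents", reader);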
private String [] bestTerms(String field, int numTerms) throws IOException {
    PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms);
    IndexReader ir = DirectoryReader.open(dir);
    try {
        int threshold = ir.maxDoc() / 10; // ignore words too common.
        Terms terms = MultiTerms.getTerms(ir, field);
        if (terms != null) {
            TermsEnum termsEnum = terms.iterator();
            while (termsEnum.next() != null) {
                int df = termsEnum.docFreq();
                if (df < threshold) {
                    String ttxt = termsEnum.term().utf8ToString();
                    pq.insertWithOverflow(new TermDf(ttxt, df));
                }
            }
        }
    } finally {
        ir.close();
    }
    String res[] = new String[pq.size()];
    int i = 0;
    while (pq.size() > 0) {
        TermDf tdf = pq.pop();
        res[i++] = tdf.word;
        System.out.println(i + ". word: " + tdf.df + " " + tdf.word);
    }
    return res;
}
@Override
public Query rewrite(IndexReader reader) throws IOException {
    if (this.terms.isEmpty()) {
        return new MatchNoDocsQuery("CommonTermsQuery with no terms");
    } else if (this.terms.size() == 1) {
        return newTermQuery(this.terms.get(0), null);
    }
    final List<LeafReaderContext> leaves = reader.leaves();
    final int maxDoc = reader.maxDoc();
    final TermStates[] contextArray = new TermStates[terms.size()];
    final Term[] queryTerms = this.terms.toArray(new Term[0]);
    collectTermStates(reader, leaves, contextArray, queryTerms);
    return buildQuery(maxDoc, contextArray, queryTerms);
}
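In CommonTermsQuery, the maxDoc passed to buildQuery is what turns a relative frequency cutoff into an absolute document-frequency threshold when terms are split into high- and low-frequency groups. A hedged sketch of that conversion (the cutoff value is a placeholder and the branching inside buildQuery is simplified here):

float maxTermFrequency = 0.01f;  // placeholder: an absolute docFreq if >= 1, otherwise a fraction of maxDoc
int threshold = maxTermFrequency >= 1f
        ? (int) maxTermFrequency
        : (int) Math.ceil(maxTermFrequency * maxDoc);
boolean isHighFrequency = docFreq > threshold;  // "common" terms go into the low-impact clause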
public TaxonomyIndexArrays(IndexReader reader) throws IOException {
    parents = new int[reader.maxDoc()];
    if (parents.length > 0) {
        initParents(reader, 0);
        // Starting Lucene 2.9, following the change LUCENE-1542, we can
        // no longer reliably read the parent "-1" (see comment in
        // LuceneTaxonomyWriter.SinglePositionTokenStream). We have no way
        // to fix this in indexing without breaking backward-compatibility
        // with existing indexes, so what we'll do instead is just
        // hard-code the parent of ordinal 0 to be -1, and assume (as is
        // indeed the case) that no other parent can be -1.
        parents[0] = TaxonomyReader.INVALID_ORDINAL;
    }
}
private void initParents(IndexReader reader, int first) throws IOException {
    if (reader.maxDoc() == first) {
        return;
    }
    // it's ok to use MultiTerms because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
            Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
            PostingsEnum.PAYLOADS);
    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
        throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
    }
    int num = reader.maxDoc();
    for (int i = first; i < num; i++) {
        if (positions.docID() == i) {
            if (positions.freq() == 0) { // shouldn't happen
                throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
            }
            parents[i] = positions.nextPosition();
            if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                if (i + 1 < num) {
                    throw new CorruptIndexException("Missing parent data for category " + (i + 1), reader.toString());
                }
                break;
            }
        } else { // this shouldn't happen
            throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
        }
    }
}
public LuceneFlamdexReader(IndexReader reader, String directory, Collection<String> intFields, Collection<String> stringFields) {
    super(directory, reader.maxDoc());
    this.reader = reader;
    this.intFields = intFields;
    this.stringFields = stringFields;
}
@Override
public <T> void query(
        @NonNull Collection<? super T> result,
        @NonNull Convertor<? super Document, T> convertor,
        @NullAllowed FieldSelector selector,
        @NullAllowed AtomicBoolean cancel,
        @NonNull Query... queries) throws IOException, InterruptedException {
    Parameters.notNull("queries", queries); //NOI18N
    Parameters.notNull("convertor", convertor); //NOI18N
    Parameters.notNull("result", result); //NOI18N
    if (selector == null) {
        selector = AllFieldsSelector.INSTANCE;
    }
    lock.readLock().lock();
    try {
        final IndexReader in = getReader();
        if (in == null) {
            return;
        }
        final BitSet bs = new BitSet(in.maxDoc());
        final Collector c = new BitSetCollector(bs);
        final Searcher searcher = new IndexSearcher(in);
        try {
            for (Query q : queries) {
                if (cancel != null && cancel.get()) {
                    throw new InterruptedException();
                }
                searcher.search(q, c);
            }
        } finally {
            searcher.close();
        }
        for (int docNum = bs.nextSetBit(0); docNum >= 0; docNum = bs.nextSetBit(docNum + 1)) {
            if (cancel != null && cancel.get()) {
                throw new InterruptedException();
            }
            final Document doc = in.document(docNum, selector);
            final T value = convertor.convert(doc);
            if (value != null) {
                result.add(value);
            }
        }
    } finally {
        lock.readLock().unlock();
    }
}
@Override
public <S, T> void queryDocTerms(
        @NonNull Map<? super T, Set<S>> result,
        @NonNull Convertor<? super Document, T> convertor,
        @NonNull Convertor<? super Term, S> termConvertor,
        @NullAllowed FieldSelector selector,
        @NullAllowed AtomicBoolean cancel,
        @NonNull Query... queries) throws IOException, InterruptedException {
    Parameters.notNull("result", result); //NOI18N
    Parameters.notNull("convertor", convertor); //NOI18N
    Parameters.notNull("termConvertor", termConvertor); //NOI18N
    Parameters.notNull("queries", queries); //NOI18N
    if (selector == null) {
        selector = AllFieldsSelector.INSTANCE;
    }
    lock.readLock().lock();
    try {
        final IndexReader in = getReader();
        if (in == null) {
            return;
        }
        final BitSet bs = new BitSet(in.maxDoc());
        final Collector c = new BitSetCollector(bs);
        final Searcher searcher = new IndexSearcher(in);
        final TermCollector termCollector = new TermCollector(c);
        try {
            for (Query q : queries) {
                if (cancel != null && cancel.get()) {
                    throw new InterruptedException();
                }
                if (q instanceof TermCollector.TermCollecting) {
                    ((TermCollector.TermCollecting) q).attach(termCollector);
                } else {
                    throw new IllegalArgumentException(
                            String.format("Query: %s does not implement TermCollecting", //NOI18N
                            q.getClass().getName()));
                }
                searcher.search(q, termCollector);
            }
        } finally {
            searcher.close();
        }
        for (int docNum = bs.nextSetBit(0); docNum >= 0; docNum = bs.nextSetBit(docNum + 1)) {
            if (cancel != null && cancel.get()) {
                throw new InterruptedException();
            }
            final Document doc = in.document(docNum, selector);
            final T value = convertor.convert(doc);
            if (value != null) {
                final Set<Term> terms = termCollector.get(docNum);
                if (terms != null) {
                    result.put(value, convertTerms(termConvertor, terms));
                }
            }
        }
    } finally {
        lock.readLock().unlock();
    }
}
public FilterableTermsEnum(IndexReader reader, String field, int docsEnumFlag, @Nullable Query filter) throws IOException {
    if ((docsEnumFlag != PostingsEnum.FREQS) && (docsEnumFlag != PostingsEnum.NONE)) {
        throw new IllegalArgumentException("invalid docsEnumFlag of " + docsEnumFlag);
    }
    this.docsEnumFlag = docsEnumFlag;
    if (filter == null) {
        // Important - need to use the doc count that includes deleted docs
        // or we have this issue: https://github.com/elasticsearch/elasticsearch/issues/7951
        numDocs = reader.maxDoc();
    }
    List<LeafReaderContext> leaves = reader.leaves();
    List<Holder> enums = new ArrayList<>(leaves.size());
    final Weight weight;
    if (filter == null) {
        weight = null;
    } else {
        final IndexSearcher searcher = new IndexSearcher(reader);
        searcher.setQueryCache(null);
        weight = searcher.createNormalizedWeight(filter, false);
    }
    for (LeafReaderContext context : leaves) {
        Terms terms = context.reader().terms(field);
        if (terms == null) {
            continue;
        }
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum == null) {
            continue;
        }
        BitSet bits = null;
        if (weight != null) {
            Scorer scorer = weight.scorer(context);
            if (scorer == null) {
                // fully filtered, none matching, no need to iterate on this
                continue;
            }
            DocIdSetIterator docs = scorer.iterator();
            // we want to force apply deleted docs
            final Bits liveDocs = context.reader().getLiveDocs();
            if (liveDocs != null) {
                docs = new FilteredDocIdSetIterator(docs) {
                    @Override
                    protected boolean match(int doc) {
                        return liveDocs.get(doc);
                    }
                };
            }
            BitDocIdSet.Builder builder = new BitDocIdSet.Builder(context.reader().maxDoc());
            builder.or(docs);
            bits = builder.build().bits();
            // Count how many docs are in our filtered set
            // TODO make this lazy-loaded only for those that need it?
            numDocs += bits.cardinality();
        }
        enums.add(new Holder(termsEnum, bits));
    }
    this.enums = enums.toArray(new Holder[enums.size()]);
}
@Override
protected AnchorTrie parseSet() throws IOException
{
    IndexReader anchors = Indexes.getReader(RepositoryDirs.ANCHORS.getPath(lang));
    AnchorTrie trie = new AnchorTrie();
    int maxdoc = anchors.maxDoc();
    PLogger plog = new PLogger(log, Step.TEN_MINUTES, "anchors", "skipped", "duplicates");
    plog.setEnd(0, maxdoc);
    plog.start("Inserting in to trie...");
    for(int i=0; i<maxdoc; i++)
    {
        plog.update(0);
        Document doc = anchors.document(i);
        if (doc == null){
            plog.update(1);
            continue;
        }
        String anchorText = doc.get(AnchorIndexer.FIELD_TEXT);
        String serial = doc.get(AnchorIndexer.FIELD_OBJECT);
        Anchor anchorObj = Anchor.deserialize(serial);
        if (anchorObj == null){
            plog.update(1);
            continue;
        }
        boolean done = trie.add(anchorText, anchorObj);
        if (!done) plog.update(2);
    }
    plog.stop();
    log.info("Now trimming...");
    trie.trim();
    log.info("Done.");
    return trie;
}
@Override
protected AnchorTernaryTrie parseSet() throws IOException
{
    File indexDir = RepositoryDirs.ANCHORS.getDir(lang);
    long indexSize = FileUtils.sizeOfDirectory(indexDir);
    long maxMemory = Runtime.getRuntime().maxMemory();
    IndexReader anchors;
    if (indexSize < maxMemory * 0.8){
        log.info("MaxMemory is enough, loading Anchor index...");
        anchors = IndexReader.open(new RAMDirectory(new SimpleFSDirectory(indexDir)), true);
        log.info("Anchor index loaded.");
    } else {
        log.info("Not enough memory ["+maxMemory/1000000+"Mb] to load Anchor index (about "+indexSize/1000000+"Mb)");
        anchors = Indexes.getReader(RepositoryDirs.ANCHORS.getPath(lang));
    }
    AnchorTernaryTrie trie = new AnchorTernaryTrie();
    int maxdoc = anchors.maxDoc();
    IntList doclist = new IntArrayList();
    for(int i=0; i<maxdoc; i++) doclist.add(i);
    Random rnd = new Random(System.currentTimeMillis());
    PLogger plog = new PLogger(log, Step.TEN_MINUTES, "anchors", "skipped", "duplicates");
    plog.setEnd(0, maxdoc);
    plog.start("Inserting in to trie...");
    while(!doclist.isEmpty())
    {
        int docID = doclist.removeInt(rnd.nextInt(doclist.size()));
        plog.update(0);
        Document doc = anchors.document(docID);
        if (doc == null){
            plog.update(1);
            continue;
        }
        String anchorText = doc.get(AnchorIndexer.FIELD_TEXT);
        String serial = doc.get(AnchorIndexer.FIELD_OBJECT);
        Anchor anchorObj = Anchor.deserialize(serial);
        if (anchorObj == null){
            plog.update(1);
            continue;
        }
        boolean added = trie.add(anchorText, anchorObj);
        if (!added) plog.update(2);
    }
    plog.stop();
    return trie;
}
@Override
public void makeIndex(String lang, File workingDir) throws IOException
{
    IndexReader articles = Indexes.getReader(RepositoryDirs.WIKIPEDIA.getPath(lang));
    Int2ObjectMap<String> bestAnchorMap = new BestAnchors(lang).getDataset();
    IndexWriter index = new IndexWriter(new SimpleFSDirectory(workingDir), new IndexWriterConfig(Version.LUCENE_34, new KeywordAnalyzer()));
    Document doc = new Document();
    Field fWID = new Field(FIELD_WID, "", Store.YES, Index.NOT_ANALYZED);
    Field fTitle = new Field(FIELD_TITLE, "", Store.YES, Index.NOT_ANALYZED);
    Field fAbstract = new Field(FIELD_ABSTRACT, "", Store.YES, Index.NO);
    Field fBestAnchor = new Field(FIELD_BEST_ANCHOR, "", Store.YES, Index.NO);
    doc.add(fWID);
    doc.add(fTitle);
    doc.add(fAbstract);
    doc.add(fBestAnchor);
    int max = articles.maxDoc();
    PLogger plog = new PLogger(log, Step.TEN_MINUTES, "pages", "indexed", "noBest");
    plog.setEnd(max);
    plog.start("Start indexing...");
    for(int i=0; i<max; i++)
    {
        plog.update(0);
        Document oldDoc = articles.document(i);
        PageType type = PageType.valueOf(oldDoc.get(WikipediaIndexer.FIELD_TYPE));
        if (type == PageType.TOPIC)
        {
            int wid = Integer.parseInt(oldDoc.get(WikipediaIndexer.FIELD_WID));
            fWID.setValue(oldDoc.get(WikipediaIndexer.FIELD_WID));
            fAbstract.setValue(oldDoc.get(WikipediaIndexer.FIELD_ABSTRACT));
            fTitle.setValue(oldDoc.get(WikipediaIndexer.FIELD_TITLE));
            String bestAnchor = bestAnchorMap.get(wid);
            if (bestAnchor == null || bestAnchor.length() == 0) plog.update(2);
            fBestAnchor.setValue(bestAnchor==null?"":bestAnchor);
            String[] cats = oldDoc.getValues(WikipediaIndexer.FIELD_CAT);
            if (cats != null) {
                for (int j=0; j<cats.length; j++)
                    doc.add(new Field(FIELD_CAT, cats[j], Store.YES, Index.NOT_ANALYZED));
            }
            index.addDocument(doc);
            plog.update(1);
            doc.removeFields(FIELD_CAT);
        }
    }
    plog.stop();
    log.info("Now optimizing...");
    index.optimize();
    index.close();
    //we cannot call this because the index is still in the temporary dir
    //so TopicDocs will be created using old index
    // log.info("Index Done, now creating WID->DOC_ID map");
    //
    // TopicDocs td = new TopicDocs(lang);
    // td.forceParsing();
    log.info("Done.");
}
@SuppressWarnings("unused")
private void logIndexStats() {
    try {
        IndexReader reader = null;
        try {
            reader = getIndexReader();
            Document doc;
            int totalFields = 0;
            Set<String> ids = new HashSet<>();
            String[] idArray;
            int count = 0;
            for (int i = 0; i < reader.maxDoc(); i++) {
                if (isDeleted(reader, i)) {
                    continue;
                }
                doc = readDocument(reader, i, null);
                totalFields += doc.getFields().size();
                count++;
                idArray = doc.getValues("id");
                for (String id : idArray) {
                    ids.add(id);
                }
            }
            logger.info("Total documents in the index: " + reader.numDocs()
                    + ", number of deletable documents in the index: " + reader.numDeletedDocs()
                    + ", valid documents: " + count + ", total fields in all documents: " + totalFields
                    + ", average number of fields per document: " + ((double) totalFields) / reader.numDocs());
            logger.info("Distinct ids in the index: " + ids.size());
        } finally {
            ReaderMonitor toCloseCurrentMonitor = currentMonitor;
            currentMonitor = null;
            if (toCloseCurrentMonitor != null) {
                toCloseCurrentMonitor.closeWhenPossible();
            }
        }
    } catch (IOException e) {
        logger.warn(e.getMessage(), e);
    }
}
private static int computeExpMissing(int numDocsWithoutField, int numIndexedDocs, IndexReader reader) {
    // The number of missing documents equals the number of docs without the field (not indexed with it, or were
    // deleted). However, in case we deleted all documents in a segment before the reader was opened, there will be
    // a mismatch between numDocs (how many we indexed) and reader.maxDoc(), so compensate for that.
    return numDocsWithoutField - reader.numDeletedDocs() - (numIndexedDocs - reader.maxDoc());
}
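A purely arithmetic illustration of that compensation, with made-up numbers (it only shows how the three terms combine, not a claim about any particular test setup):

// Hypothetical: 100 docs indexed, 10 of them without the field, 5 later deleted,
// and a fully-deleted segment dropped so the reader reports maxDoc() == 95.
int numIndexedDocs = 100;
int numDocsWithoutField = 10;
int numDeletedDocs = 5;   // reader.numDeletedDocs()
int maxDoc = 95;          // reader.maxDoc()
int expMissing = numDocsWithoutField - numDeletedDocs - (numIndexedDocs - maxDoc); // 10 - 5 - 5 = 0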
ImageScorer(IndexReader reader, Bits liveDocs, Weight w) {
    super(w, luceneFieldName, lireFeature, reader, ImageQuery.this.getBoost());
    this.liveDocs = liveDocs;
    maxDoc = reader.maxDoc();
}