Logo Search packages:      
Sourcecode: beagle version File versions  Download package

DocumentWriter.cs

/*
 * Copyright 2004 The Apache Software Foundation
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 * http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

using System;
using Analyzer = Lucene.Net.Analysis.Analyzer;
using Token = Lucene.Net.Analysis.Token;
using TokenStream = Lucene.Net.Analysis.TokenStream;
using Document = Lucene.Net.Documents.Document;
using Field = Lucene.Net.Documents.Field;
using Similarity = Lucene.Net.Search.Similarity;
using Directory = Lucene.Net.Store.Directory;
using IndexOutput = Lucene.Net.Store.IndexOutput;

namespace Lucene.Net.Index
{
      
      public sealed class DocumentWriter
      {
            private void  InitBlock()
            {
                  termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
            }
            private Analyzer analyzer;
            private Directory directory;
            private Similarity similarity;
            private FieldInfos fieldInfos;
            private int maxFieldLength;
            private int termIndexInterval;
            private System.IO.TextWriter infoStream;
            
            /// <summary>This ctor used by test code only.
            /// 
            /// </summary>
            /// <param name="directory">The directory to write the document information to
            /// </param>
            /// <param name="analyzer">The analyzer to use for the document
            /// </param>
            /// <param name="similarity">The Similarity function
            /// </param>
            /// <param name="maxFieldLength">The maximum number of tokens a field may have
            /// </param>
            public DocumentWriter(Directory directory, Analyzer analyzer, Similarity similarity, int maxFieldLength)
            {
                  InitBlock();
                  this.directory = directory;
                  this.analyzer = analyzer;
                  this.similarity = similarity;
                  this.maxFieldLength = maxFieldLength;
            }
            
            public DocumentWriter(Directory directory, Analyzer analyzer, IndexWriter writer)
            {
                  InitBlock();
                  this.directory = directory;
                  this.analyzer = analyzer;
                  this.similarity = writer.GetSimilarity();
                  this.maxFieldLength = writer.GetMaxFieldLength();
                  this.termIndexInterval = writer.GetTermIndexInterval();
            }
            
            public /*internal*/ void  AddDocument(System.String segment, Document doc)
            {
                  // write field names
                  fieldInfos = new FieldInfos();
                  fieldInfos.Add(doc);
                  fieldInfos.Write(directory, segment + ".fnm");
                  
                  // write field values
                  FieldsWriter fieldsWriter = new FieldsWriter(directory, segment, fieldInfos);
                  try
                  {
                        fieldsWriter.AddDocument(doc);
                  }
                  finally
                  {
                        fieldsWriter.Close();
                  }
                  
                  // invert doc into postingTable
                  postingTable.Clear(); // clear postingTable
                  fieldLengths = new int[fieldInfos.Size()]; // init fieldLengths
                  fieldPositions = new int[fieldInfos.Size()]; // init fieldPositions
                  fieldOffsets = new int[fieldInfos.Size()]; // init fieldOffsets
                  
                  fieldBoosts = new float[fieldInfos.Size()]; // init fieldBoosts
            float boost = doc.GetBoost();
            for (int i = 0; i < fieldBoosts.Length; i++)
            {
                fieldBoosts[i] = boost;
            }
                  
                  InvertDocument(doc);
                  
                  // sort postingTable into an array
                  Posting[] postings = SortPostingTable();
                  
                  /*
                  for (int i = 0; i < postings.length; i++) {
                  Posting posting = postings[i];
                  System.out.print(posting.term);
                  System.out.print(" freq=" + posting.freq);
                  System.out.print(" pos=");
                  System.out.print(posting.positions[0]);
                  for (int j = 1; j < posting.freq; j++)
                  System.out.print("," + posting.positions[j]);
                  System.out.println("");
                  }
                  */
                  
                  // write postings
                  WritePostings(postings, segment);
                  
                  // write norms of indexed fields
                  WriteNorms(segment);
            }
            
            // Keys are Terms, values are Postings.
            // Used to buffer a document before it is written to the index.
            private System.Collections.Hashtable postingTable = System.Collections.Hashtable.Synchronized(new System.Collections.Hashtable());
            private int[] fieldLengths;
            private int[] fieldPositions;
            private int[] fieldOffsets;
            private float[] fieldBoosts;
            
            // Tokenizes the fields of a document into Postings.
            private void  InvertDocument(Document doc)
            {
                  foreach(Field field in doc.Fields())
                  {
                        System.String fieldName = field.Name();
                        int fieldNumber = fieldInfos.FieldNumber(fieldName);
                        
                        int length = fieldLengths[fieldNumber]; // length of field
                        int position = fieldPositions[fieldNumber]; // position in field
                        if (length > 0)
                              position += analyzer.GetPositionIncrementGap(fieldName);
                        int offset = fieldOffsets[fieldNumber]; // offset field
                        
                        if (field.IsIndexed())
                        {
                              if (!field.IsTokenized())
                              {
                                    // un-tokenized field
                                    System.String stringValue = field.StringValue();
                                    if (field.IsStoreOffsetWithTermVector())
                                          AddPosition(fieldName, stringValue, position++, new TermVectorOffsetInfo(offset, offset + stringValue.Length));
                                    else
                                          AddPosition(fieldName, stringValue, position++, null);
                                    offset += stringValue.Length;
                                    length++;
                              }
                              else
                              {
                                    System.IO.TextReader reader; // find or make Reader
                                    if (field.ReaderValue() != null)
                                          reader = field.ReaderValue();
                                    else if (field.StringValue() != null)
                                          reader = new System.IO.StringReader(field.StringValue());
                                    else
                                          throw new System.ArgumentException("field must have either String or Reader value");
                                    
                                    // Tokenize field and add to postingTable
                                    TokenStream stream = analyzer.TokenStream(fieldName, reader);
                                    try
                                    {
                                          Token lastToken = null;
                                          for (Token t = stream.Next(); t != null; t = stream.Next())
                                          {
                                                position += (t.GetPositionIncrement() - 1);
                                                
                                                if (field.IsStoreOffsetWithTermVector())
                                                      AddPosition(fieldName, t.TermText(), position++, new TermVectorOffsetInfo(offset + t.StartOffset(), offset + t.EndOffset()));
                                                else
                                                      AddPosition(fieldName, t.TermText(), position++, null);
                                                
                                                lastToken = t;
                                                if (++length > maxFieldLength)
                                                {
                                                      if (infoStream != null)
                                                            infoStream.WriteLine("maxFieldLength " + maxFieldLength + " reached, ignoring following tokens");
                                                      break;
                                                }
                                          }
                                          
                                          if (lastToken != null)
                                                offset += lastToken.EndOffset() + 1;
                                    }
                                    finally
                                    {
                                          stream.Close();
                                    }
                              }
                              
                              fieldLengths[fieldNumber] = length; // save field length
                              fieldPositions[fieldNumber] = position; // save field position
                              fieldBoosts[fieldNumber] *= field.GetBoost();
                              fieldOffsets[fieldNumber] = offset;
                        }
                  }
            }
            
            private Term termBuffer = new Term("", ""); // avoid consing
            
            private void  AddPosition(System.String field, System.String text, int position, TermVectorOffsetInfo offset)
            {
                  termBuffer.Set(field, text);
                  //System.out.println("Offset: " + offset);
                  Posting ti = (Posting) postingTable[termBuffer];
                  if (ti != null)
                  {
                        // word seen before
                        int freq = ti.freq;
                        if (ti.positions.Length == freq)
                        {
                              // positions array is full
                              int[] newPositions = new int[freq * 2]; // double size
                              int[] positions = ti.positions;
                              for (int i = 0; i < freq; i++)
                              // copy old positions to new
                                    newPositions[i] = positions[i];
                              ti.positions = newPositions;
                        }
                        ti.positions[freq] = position; // add new position
                        
                        if (offset != null)
                        {
                              if (ti.offsets.Length == freq)
                              {
                                    TermVectorOffsetInfo[] newOffsets = new TermVectorOffsetInfo[freq * 2];
                                    TermVectorOffsetInfo[] offsets = ti.offsets;
                                    for (int i = 0; i < freq; i++)
                                    {
                                          newOffsets[i] = offsets[i];
                                    }
                                    ti.offsets = newOffsets;
                              }
                              ti.offsets[freq] = offset;
                        }
                        ti.freq = freq + 1; // update frequency
                  }
                  else
                  {
                        // word not seen before
                        Term term = new Term(field, text, false);
                        postingTable[term] = new Posting(term, position, offset);
                  }
            }
            
            private Posting[] SortPostingTable()
            {
                  // copy postingTable into an array
                  Posting[] array = new Posting[postingTable.Count];
                  System.Collections.IEnumerator postings = postingTable.Values.GetEnumerator();
                  for (int i = 0; postings.MoveNext(); i++)
                  {
                        array[i] = (Posting) postings.Current;
                  }
                  
                  // sort the array
                  QuickSort(array, 0, array.Length - 1);
                  
                  return array;
            }
            
            private static void  QuickSort(Posting[] postings, int lo, int hi)
            {
                  if (lo >= hi)
                        return ;
                  
                  int mid = (lo + hi) / 2;
                  
                  if (postings[lo].term.CompareTo(postings[mid].term) > 0)
                  {
                        Posting tmp = postings[lo];
                        postings[lo] = postings[mid];
                        postings[mid] = tmp;
                  }
                  
                  if (postings[mid].term.CompareTo(postings[hi].term) > 0)
                  {
                        Posting tmp = postings[mid];
                        postings[mid] = postings[hi];
                        postings[hi] = tmp;
                        
                        if (postings[lo].term.CompareTo(postings[mid].term) > 0)
                        {
                              Posting tmp2 = postings[lo];
                              postings[lo] = postings[mid];
                              postings[mid] = tmp2;
                        }
                  }
                  
                  int left = lo + 1;
                  int right = hi - 1;
                  
                  if (left >= right)
                        return ;
                  
                  Term partition = postings[mid].term;
                  
                  for (; ; )
                  {
                        while (postings[right].term.CompareTo(partition) > 0)
                              --right;
                        
                        while (left < right && postings[left].term.CompareTo(partition) <= 0)
                              ++left;
                        
                        if (left < right)
                        {
                              Posting tmp = postings[left];
                              postings[left] = postings[right];
                              postings[right] = tmp;
                              --right;
                        }
                        else
                        {
                              break;
                        }
                  }
                  
                  QuickSort(postings, lo, left);
                  QuickSort(postings, left + 1, hi);
            }
            
            private void  WritePostings(Posting[] postings, System.String segment)
            {
                  IndexOutput freq = null, prox = null;
                  TermInfosWriter tis = null;
                  TermVectorsWriter termVectorWriter = null;
                  try
                  {
                        //open files for inverse index storage
                        freq = directory.CreateOutput(segment + ".frq");
                        prox = directory.CreateOutput(segment + ".prx");
                        tis = new TermInfosWriter(directory, segment, fieldInfos, termIndexInterval);
                        TermInfo ti = new TermInfo();
                        System.String currentField = null;
                        
                        for (int i = 0; i < postings.Length; i++)
                        {
                              Posting posting = postings[i];
                              
                              // add an entry to the dictionary with pointers to prox and freq files
                              ti.Set(1, freq.GetFilePointer(), prox.GetFilePointer(), - 1);
                              tis.Add(posting.term, ti);
                              
                              // add an entry to the freq file
                              int postingFreq = posting.freq;
                              if (postingFreq == 1)
                              // optimize freq=1
                                    freq.WriteVInt(1);
                              // set low bit of doc num.
                              else
                              {
                                    freq.WriteVInt(0); // the document number
                                    freq.WriteVInt(postingFreq); // frequency in doc
                              }
                              
                              int lastPosition = 0; // write positions
                              int[] positions = posting.positions;
                              for (int j = 0; j < postingFreq; j++)
                              {
                                    // use delta-encoding
                                    int position = positions[j];
                                    prox.WriteVInt(position - lastPosition);
                                    lastPosition = position;
                              }
                              // check to see if we switched to a new field
                              System.String termField = posting.term.Field();
                              if (currentField != termField)
                              {
                                    // changing field - see if there is something to save
                                    currentField = termField;
                                    FieldInfo fi = fieldInfos.FieldInfo(currentField);
                                    if (fi.storeTermVector)
                                    {
                                          if (termVectorWriter == null)
                                          {
                                                termVectorWriter = new TermVectorsWriter(directory, segment, fieldInfos);
                                                termVectorWriter.OpenDocument();
                                          }
                                          termVectorWriter.OpenField(currentField);
                                    }
                                    else if (termVectorWriter != null)
                                    {
                                          termVectorWriter.CloseField();
                                    }
                              }
                              if (termVectorWriter != null && termVectorWriter.IsFieldOpen())
                              {
                                    termVectorWriter.AddTerm(posting.term.Text(), postingFreq, posting.positions, posting.offsets);
                              }
                        }
                        if (termVectorWriter != null)
                              termVectorWriter.CloseDocument();
                  }
                  finally
                  {
                        // make an effort to close all streams we can but remember and re-throw
                        // the first exception encountered in this process
                        System.IO.IOException keep = null;
                        if (freq != null)
                              try
                              {
                                    freq.Close();
                              }
                              catch (System.IO.IOException e)
                              {
                                    if (keep == null)
                                          keep = e;
                              }
                        if (prox != null)
                              try
                              {
                                    prox.Close();
                              }
                              catch (System.IO.IOException e)
                              {
                                    if (keep == null)
                                          keep = e;
                              }
                        if (tis != null)
                              try
                              {
                                    tis.Close();
                              }
                              catch (System.IO.IOException e)
                              {
                                    if (keep == null)
                                          keep = e;
                              }
                        if (termVectorWriter != null)
                              try
                              {
                                    termVectorWriter.Close();
                              }
                              catch (System.IO.IOException e)
                              {
                                    if (keep == null)
                                          keep = e;
                              }
                        if (keep != null)
                        {
                              throw new System.IO.IOException(keep.StackTrace);
                        }
                  }
            }
            
            private void  WriteNorms(System.String segment)
            {
                  for (int n = 0; n < fieldInfos.Size(); n++)
                  {
                        FieldInfo fi = fieldInfos.FieldInfo(n);
                        if (fi.isIndexed && !fi.omitNorms)
                        {
                              float norm = fieldBoosts[n] * similarity.LengthNorm(fi.name, fieldLengths[n]);
                              IndexOutput norms = directory.CreateOutput(segment + ".f" + n);
                              try
                              {
                                    norms.WriteByte(Similarity.EncodeNorm(norm));
                              }
                              finally
                              {
                                    norms.Close();
                              }
                        }
                  }
            }
            
            /// <summary>If non-null, a message will be printed to this if maxFieldLength is reached.</summary>
            internal void  SetInfoStream(System.IO.TextWriter infoStream)
            {
                  this.infoStream = infoStream;
            }
      }
      
      sealed class Posting
      {
            // info about a Term in a doc
            internal Term term; // the Term
            internal int freq; // its frequency in doc
            internal int[] positions; // positions it occurs at
            internal TermVectorOffsetInfo[] offsets;
            
            internal Posting(Term t, int position, TermVectorOffsetInfo offset)
            {
                  term = t;
                  freq = 1;
                  positions = new int[1];
                  positions[0] = position;
                  if (offset != null)
                  {
                        offsets = new TermVectorOffsetInfo[1];
                        offsets[0] = offset;
                  }
                  else
                        offsets = null;
            }
      }
}

Generated by  Doxygen 1.6.0   Back to index