Index: src/uk/ac/gla/terrier/applications/TRECLMIndexing.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/applications/TRECLMIndexing.java,v
retrieving revision 1.14
diff -w -u -r1.14 TRECLMIndexing.java
--- src/uk/ac/gla/terrier/applications/TRECLMIndexing.java	29 Jan 2009 15:39:56 -0000	1.14
+++ src/uk/ac/gla/terrier/applications/TRECLMIndexing.java	26 Feb 2009 16:11:46 -0000
@@ -69,11 +69,15 @@
 			return;
 		}
 	
+		try{
 		CreateTermEstimateIndex teIndex = new CreateTermEstimateIndex(index, modelName);
 		teIndex.createTermEstimateIndex();
 		
 		CreateDocumentInitialWeightIndex docWIndex = new CreateDocumentInitialWeightIndex(index, modelName);
 		docWIndex.createDocumentInitialWeightIndex();
+		} catch (Exception e) {
+			logger.error("Could not make LM structures", e);
+		}
 	}
 	
 	/** 
Index: src/uk/ac/gla/terrier/compression/BitFile.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/compression/BitFile.java,v
retrieving revision 1.32
diff -w -u -r1.32 BitFile.java
--- src/uk/ac/gla/terrier/compression/BitFile.java	28 Jan 2009 20:16:45 -0000	1.32
+++ src/uk/ac/gla/terrier/compression/BitFile.java	26 Feb 2009 16:11:47 -0000
@@ -316,6 +316,11 @@
 		return this;
 	}
 	
+	/** Not supported by this class: always throws an IOException with the message "Unsupported". */
+	public BitIn readReset(long startByteOffset, byte startBitOffset) throws IOException {
+		throw new IOException("Unsupported");
+	}
+	
 	/**
 	 * Reads a gamma encoded integer from the underlying stream
 	 * @return the number read
@@ -635,4 +640,5 @@
 		if(b > 0 ) return readMinimalBinary(b);
 		else return 0;
 	}
+
 }
Index: src/uk/ac/gla/terrier/compression/BitFileInMemory.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/compression/BitFileInMemory.java,v
retrieving revision 1.5
diff -w -u -r1.5 BitFileInMemory.java
--- src/uk/ac/gla/terrier/compression/BitFileInMemory.java	28 Jan 2009 20:16:45 -0000	1.5
+++ src/uk/ac/gla/terrier/compression/BitFileInMemory.java	26 Feb 2009 16:11:47 -0000
@@ -95,6 +95,11 @@
 		return new BitInReader(startByteOffset, startBitOffset, endByteOffset, endBitOffset);
 	}
 
+	public BitIn readReset(long startByteOffset, byte startBitOffset) 
+	{
+		return new BitInReader(startByteOffset, startBitOffset);
+	}
+
 	/** Close this object. Does nothing. */
 	public void close()
 	{
@@ -107,12 +112,16 @@
     	protected int bitOffset;
 		protected int readByteOffset;
 		
-		public BitInReader(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset)
+		public BitInReader(long startByteOffset, byte startBitOffset)
 		{
-			
 			readByteOffset = (int)startByteOffset;
 			bitOffset = startBitOffset;
 		}
+		/** The end offsets are ignored; this delegates to the two-argument constructor. */
+		public BitInReader(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset)
+		{
+			this(startByteOffset, startBitOffset);
+		}
 		/**
 		* Returns the byte offset of the stream.
 		* It corresponds to the position of the
Index: src/uk/ac/gla/terrier/compression/BitInSeekable.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/compression/BitInSeekable.java,v
retrieving revision 1.3
diff -w -u -r1.3 BitInSeekable.java
--- src/uk/ac/gla/terrier/compression/BitInSeekable.java	28 Jan 2009 20:16:45 -0000	1.3
+++ src/uk/ac/gla/terrier/compression/BitInSeekable.java	26 Feb 2009 16:11:47 -0000
@@ -52,4 +52,17 @@
 	 * @return Returns the BitIn object to use to read that data
 	 */	
 	public BitIn readReset(long startByteOffset, byte startBitOffset, long endByteOffset, byte endBitOffset) throws IOException;
+	
+	/**
+	 * Prepares to read from the file starting at the given position;
+	 * after this call, a sequence of read calls may follow. Unlike the
+	 * four-argument variant, no end offset is supplied. The start offset
+	 * is inclusive. For example, if we call this method with arguments
+	 * 0, 2, reading starts from the third bit of the first byte.
+	 * Implementations that cannot support this may throw an IOException.
+	 * @param startByteOffset the starting byte to read from
+	 * @param startBitOffset the bit offset in the starting byte
+	 * @return Returns the BitIn object to use to read that data
+	 */
+	public BitIn readReset(long startByteOffset, byte startBitOffset) throws IOException;
 }
Index: src/uk/ac/gla/terrier/indexing/BasicIndexer.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/BasicIndexer.java,v
retrieving revision 1.47
diff -w -u -r1.47 BasicIndexer.java
--- src/uk/ac/gla/terrier/indexing/BasicIndexer.java	28 Jan 2009 20:16:46 -0000	1.47
+++ src/uk/ac/gla/terrier/indexing/BasicIndexer.java	26 Feb 2009 16:11:47 -0000
@@ -35,10 +35,7 @@
 import uk.ac.gla.terrier.structures.indexing.DocumentPostingList;
 import uk.ac.gla.terrier.structures.indexing.InvertedIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
-import uk.ac.gla.terrier.structures.indexing.UTFInvertedIndexBuilder;
-import uk.ac.gla.terrier.structures.indexing.UTFLexiconBuilder;
 import uk.ac.gla.terrier.terms.TermPipeline;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
 import uk.ac.gla.terrier.utility.TermCodes;
 /** 
@@ -171,15 +168,7 @@
 	public void createDirectIndex(Collection[] collections)
 	{
 		currentIndex = Index.createNewIndex(path, prefix);
-		if (UTFIndexing)
-		{
-			lexiconBuilder = new UTFLexiconBuilder(currentIndex);
-		}
-		else
-		{
-			lexiconBuilder = new LexiconBuilder(currentIndex);
-		}
-		
+		lexiconBuilder = new LexiconBuilder(currentIndex, "lexicon");
 		directIndexBuilder = new DirectIndexBuilder(currentIndex);
 		docIndexBuilder = new DocumentIndexBuilder(currentIndex);
 				
@@ -372,16 +361,8 @@
 
 
 		//generate the inverted index
-		if (UTFIndexing)
-		{
-			logger.info("Started building the UTF inverted index...");
-			invertedIndexBuilder = new UTFInvertedIndexBuilder(currentIndex);
-		}
-		else
-		{
 			logger.info("Started building the inverted index...");
-			invertedIndexBuilder = new InvertedIndexBuilder(currentIndex);
-		}
+		invertedIndexBuilder = new InvertedIndexBuilder(currentIndex, "inverted");
 		
 		invertedIndexBuilder.createInvertedIndex();
 		finishedInvertedIndexBuild();
@@ -421,13 +402,6 @@
 	/** Hook method, called when the inverted index is finished - ie the lexicon is finished */
 	protected void finishedInvertedIndexBuild()
 	{
-		if (Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"))) {
-			logger.debug("Building lexicon hash");
-			try{
-				LexiconBuilder.createLexiconHash(currentIndex);
-			} catch (IOException ioe) {
-				logger.warn("Problem creating (optional) Lexicon Hash", ioe);
-			}
-		}
+		LexiconBuilder.optimise(currentIndex, "lexicon");
 	}
 }
Index: src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java,v
retrieving revision 1.11
diff -w -u -r1.11 BasicSinglePassIndexer.java
--- src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java	28 Jan 2009 20:16:46 -0000	1.11
+++ src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java	26 Feb 2009 16:11:47 -0000
@@ -32,14 +32,12 @@
 import java.util.LinkedList;
 import java.util.Queue;
 
+import uk.ac.gla.terrier.structures.BasicLexiconEntry;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconInputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
 import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.DocumentPostingList;
-import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
 import uk.ac.gla.terrier.structures.indexing.singlepass.FieldPostingInRun;
 import uk.ac.gla.terrier.structures.indexing.singlepass.FieldsMemoryPostings;
 import uk.ac.gla.terrier.structures.indexing.singlepass.FileRunIteratorFactory;
@@ -228,15 +226,19 @@
 			try{
 				mp.finish(finishMemoryPosting());
 			}catch(Exception e){
-				e.printStackTrace();
+				logger.error("Problem creating index", e);
 			}
 			endCollection = System.currentTimeMillis();
 			long partialTime = (endCollection-startCollection)/1000;
 			logger.info("Collection #"+collectionNo+ " took "+partialTime+ " seconds to build the runs for "+numberOfDocuments+" documents\n");
 			logger.info("Merging "+fileNames.size()+" runs...");
 			startCollection = System.currentTimeMillis();
+			try{
 			performMultiWayMerge();
 			docIndexBuilder.finishedCollections();
+			} catch (Exception e) {
+				logger.error("Problem finishing index", e);
+			}
 			endCollection = System.currentTimeMillis();
 			logger.info("Collection #"+collectionNo+" took "+((endCollection-startCollection)/1000)+" seconds to merge\n ");
 			logger.info("Collection #"+collectionNo+" total time "+( (endCollection-startCollection)/1000+partialTime));
@@ -305,15 +307,15 @@
 	 * in a set of previously written runs.
 	 * The file names and the number of runs are given by the private queue
 	 */
-	public void performMultiWayMerge(){
+	public void performMultiWayMerge() throws IOException {
 		String[][] fileNames = getFileNames();
-		LexiconOutputStream lexStream = createLexiconOutputStream(path, prefix);
+		LexiconOutputStream<String> lexStream = new MapFileLexiconOutputStream(this.currentIndex, "lexicon", BasicLexiconEntry.Factory.class);
 		try{
 			if (useFieldInformation)
 				createFieldRunMerger(fileNames);
 			else
 				createRunMerger(fileNames);
-			merger.beginMerge(fileNames.length, path + ApplicationSetup.FILE_SEPARATOR + prefix +  ApplicationSetup.IFSUFFIX);
+			merger.beginMerge(fileNames.length, path + ApplicationSetup.FILE_SEPARATOR + prefix +  ".inverted.bf");
 			while(!merger.isDone()){
 				merger.mergeOne(lexStream);
 			}
@@ -330,17 +332,16 @@
 			currentIndex.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
 			currentIndex.setIndexProperty("num.Pointers", ""+numberOfPointers);
 			currentIndex.setIndexProperty("num.Tokens", ""+numberOfTokens);
-			createLexicon(numberOfUniqueTerms);
 			currentIndex.addIndexStructure(
 					"inverted",
 					invertedIndexClass,
-					"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String",
-					"lexicon,path,prefix");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String", 
+					"index,structureName");
 			currentIndex.addIndexStructureInputStream(
                     "inverted",
-                    invertedIndexInputStreamClass,
-                    "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-                    "path,prefix,lexicon-inputstream");
+                    "uk.ac.gla.terrier.structures.InvertedIndexInputStream",
+                    "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+                    "index,structureName,lexicon-inputstream");
 			currentIndex.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT );
 		}catch(Exception e){
 			logger.error("Problem in performMultiWayMerge", e);
@@ -359,36 +360,6 @@
 		return files;
 	}
 
-	/**
-	 * Hook method that creates the right LexiconBuilder instance
-	 * @throws IOException
-	 */
-	protected void createLexicon(int numberOfEntries) throws IOException{
-		final LexiconInputStream lis = createLexiconInputStream(path, prefix);
-		LexiconBuilder.createLexiconIndex(lis, numberOfEntries, lis.getEntrySize(), path, prefix );
-		currentIndex.addIndexStructure(
-				"lexicon",
-				UTFIndexing ? "uk.ac.gla.terrier.structures.UTFLexicon" :"uk.ac.gla.terrier.structures.Lexicon" );
-		currentIndex.addIndexStructureInputStream(
-				"lexicon",
-				UTFIndexing ? "uk.ac.gla.terrier.structures.UTFLexiconInputStream" :"uk.ac.gla.terrier.structures.LexiconInputStream");
-	}
-
-	/**
-	 * Hook method that creates the rigth LexiconOutputStream instance.
- 	 * @param name filename for the lexicon file.
-	 */
-	protected LexiconOutputStream createLexiconOutputStream(String path, String prefix){
-		return UTFIndexing ? new UTFLexiconOutputStream(path, prefix) : new LexiconOutputStream(path, prefix);
-	}
-
-	/**
-	 * Hook method that creates the rigth LexiconOutputStream instance.
- 	 * @param name filename for the lexicon file.
-	 */
-	protected LexiconInputStream createLexiconInputStream(String path, String prefix){
-		return UTFIndexing ? new UTFLexiconInputStream(path, prefix) : new LexiconInputStream(path, prefix);
-	}
 
 	/**
 	 * Hook method that creates a FieldRunMerger instance
Index: src/uk/ac/gla/terrier/indexing/BlockIndexer.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/BlockIndexer.java,v
retrieving revision 1.49
diff -w -u -r1.49 BlockIndexer.java
--- src/uk/ac/gla/terrier/indexing/BlockIndexer.java	28 Jan 2009 20:16:46 -0000	1.49
+++ src/uk/ac/gla/terrier/indexing/BlockIndexer.java	26 Feb 2009 16:11:47 -0000
@@ -26,9 +26,11 @@
  * Rodrygo Santo <rodrygo{a.}dcs.gla.ac.uk>
  */
 package uk.ac.gla.terrier.indexing;
+import gnu.trove.THashSet;
+
 import java.io.IOException;
 import java.util.Set;
-import gnu.trove.THashSet;
+
 import uk.ac.gla.terrier.structures.FilePosition;
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.indexing.BlockDirectIndexBuilder;
@@ -38,8 +40,6 @@
 import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.DocumentPostingList;
 import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
-import uk.ac.gla.terrier.structures.indexing.UTFBlockInvertedIndexBuilder;
-import uk.ac.gla.terrier.structures.indexing.UTFBlockLexiconBuilder;
 import uk.ac.gla.terrier.terms.TermPipeline;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
@@ -281,14 +281,7 @@
 			(Boolean.parseBoolean(ApplicationSetup.getProperty("block.delimiters.enabled", "false"))
 			? " delimited-block indexing enabled" : ""));
 		currentIndex = Index.createNewIndex(path, prefix);
-		if (UTFIndexing)
-		{
-			lexiconBuilder = new UTFBlockLexiconBuilder(currentIndex);
-		}
-		else
-		{
-			lexiconBuilder = new BlockLexiconBuilder(currentIndex);
-		}
+		lexiconBuilder = new BlockLexiconBuilder(currentIndex, "lexicon");
 		directIndexBuilder = new BlockDirectIndexBuilder(currentIndex);
 		docIndexBuilder = new DocumentIndexBuilder(currentIndex);
 		//int LexiconCount = 0;
@@ -412,16 +405,8 @@
 			return;
 		}
 
-		if (UTFIndexing)
-		{
-			logger.info("Started building the utf block inverted index...");
-			invertedIndexBuilder = new UTFBlockInvertedIndexBuilder(currentIndex);
-		}
-		else
-		{
 			logger.info("Started building the block inverted index...");
-			invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex);
-		}
+		invertedIndexBuilder = new BlockInvertedIndexBuilder(currentIndex, "inverted");
 		invertedIndexBuilder.createInvertedIndex();
 		this.finishedInvertedIndexBuild();
 		currentIndex.flush();
@@ -459,14 +444,7 @@
 	/** Hook method, called when the inverted index is finished - ie the lexicon is finished */
 	protected void finishedInvertedIndexBuild()
 	{
-		if (Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"))) {
-			logger.debug("Building lexicon hash");
-			try{
-				LexiconBuilder.createLexiconHash(currentIndex);
-			} catch (IOException ioe) {
-				logger.warn("Problem creating (optional) Lexicon Hash", ioe);
-			}
-		}
+		LexiconBuilder.optimise(currentIndex, "lexicon");
 	}
 
 	
Index: src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java,v
retrieving revision 1.23
diff -w -u -r1.23 CreateDocumentInitialWeightIndex.java
--- src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java	28 Jan 2009 20:16:46 -0000	1.23
+++ src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java	26 Feb 2009 16:11:47 -0000
@@ -25,7 +25,6 @@
  */
 package uk.ac.gla.terrier.indexing;
 import java.io.DataOutputStream;
-import java.io.File;
 import java.io.IOException;
 import java.util.Arrays;
 
@@ -33,17 +32,17 @@
 
 import uk.ac.gla.terrier.matching.models.languagemodel.LanguageModel;
 import uk.ac.gla.terrier.structures.CollectionStatistics;
-import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.DirectIndex;
 import uk.ac.gla.terrier.structures.DocumentIndex;
+import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.InvertedIndex;
 import uk.ac.gla.terrier.structures.Lexicon;
 import uk.ac.gla.terrier.structures.indexing.DocumentInitialWeightIndex;
 import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex;
+import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.Files;
 import uk.ac.gla.terrier.utility.Rounding;
 import uk.ac.gla.terrier.utility.TerrierTimer;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 /**
  * This class creates the initial weight index of all
  * documents in the collection. This is done for 
@@ -63,7 +62,7 @@
 	protected InvertedIndex invIndex;
 	
 	/** The Lexicon for retrieval. */
-	protected Lexicon lexicon;
+	protected Lexicon<String> lexicon;
 	
 	/** The DirectIndex for retrieval. */
 	protected DirectIndex directIndex;
@@ -89,16 +88,9 @@
 	
 	/** The data structure of the term esitmates. */
 	protected TermEstimateIndex teIndex;
-	/**
-	 * The default constructor of CreateDocumentInitialWeightIndex.
-	 * @param modelName The name of the applied language model.
-	 */
-	public CreateDocumentInitialWeightIndex(String modelName) 
-	{
-		this(Index.createIndex(), modelName);
-	}
 
-	public CreateDocumentInitialWeightIndex(Index i, String modelName) {
+
+	public CreateDocumentInitialWeightIndex(Index i, String modelName) throws IOException {
 		long startLoading = System.currentTimeMillis();
 		docIndex = i.getDocumentIndex();
 		lexicon = i.getLexicon();
@@ -148,9 +140,9 @@
 		TerrierTimer timer1 = new TerrierTimer();
 		timer1.start();
 		double[] TF = new double[(int)numberOfUniqueTerms];
-		for (int i = 0; i < numberOfUniqueTerms; i++){
-			lexicon.findTerm(i);
-			TF[i] = (double)lexicon.getTF();
+		for (int i = 0; i < numberOfUniqueTerms; i++)
+		{
+			TF[i] = (double)lexicon.getLexiconEntry(i).getValue().getFrequency();
 		}
 		timer1.setBreakPoint();
 		if(logger.isDebugEnabled()) {
Index: src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java,v
retrieving revision 1.23
diff -w -u -r1.23 CreateTermEstimateIndex.java
--- src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java	28 Jan 2009 20:16:46 -0000	1.23
+++ src/uk/ac/gla/terrier/indexing/CreateTermEstimateIndex.java	26 Feb 2009 16:11:47 -0000
@@ -25,8 +25,9 @@
  */
 package uk.ac.gla.terrier.indexing;
 import java.io.DataOutputStream;
-import java.io.File;
 import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
 
 import org.apache.log4j.Logger;
 
@@ -36,11 +37,11 @@
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.InvertedIndex;
 import uk.ac.gla.terrier.structures.Lexicon;
-import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex;
+import uk.ac.gla.terrier.structures.LexiconEntry;
+import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.Files;
 import uk.ac.gla.terrier.utility.Rounding;
 import uk.ac.gla.terrier.utility.TerrierTimer;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 /**
  * This class creates the term estimate index of all terms in vocabulary. This is
  * done for language modeling approach.
@@ -121,9 +122,10 @@
 	 * Create the TermEstimateIndex. It computes the average term generation probability for each term in the vocabulary of the collection.
 	 *
 	 */
+	@SuppressWarnings("unchecked")
 	public void createTermEstimateIndex(){
 		TerrierTimer timer = null;
-		long numberOfUniqueTerms = collectionStatistics.getNumberOfUniqueTerms();
+		int numberOfUniqueTerms = collectionStatistics.getNumberOfUniqueTerms();
 		if(logger.isInfoEnabled()){
 		logger.info("number of unique terms: " + numberOfUniqueTerms);
 		logger.info("Creating TermEstimateIndex...");
@@ -133,10 +135,15 @@
 			timer.setTotalNumber((double)numberOfUniqueTerms);
 			timer.start();
 		}
-		termEstimates = new double[(int)numberOfUniqueTerms];
-		for (int i = 0; i < numberOfUniqueTerms; i++){
-			lexicon.seekEntry(i);
-			int[][] pointers = invIndex.getDocuments(i);
+		termEstimates = new double[numberOfUniqueTerms];
+		
+		Iterator<Map.Entry<String,LexiconEntry>> lexiconStream = 
+			(Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
+		int i=0;
+		while(lexiconStream.hasNext())
+		{
+			Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+			int[][] pointers = invIndex.getDocuments(lee.getValue());
 			int[] docids = pointers[0];
 			int[] tf = pointers[1];
 			double[] docLength = new double[tf.length];
@@ -148,17 +155,18 @@
 			if(logger.isDebugEnabled()){
 				if ((i+1) % 10000 == 0){
 					timer.setRemainingTime((i+1));
-					logger.debug("term: " + lexicon.getTerm() +
-							", TF: " + lexicon.getTF() +", " +
+					logger.debug("term: " + lee.getKey() +
+							", TF: " + lee.getValue().getFrequency() +", " +
 							Rounding.toString((double)(i+1)/numberOfUniqueTerms*100, 2) +
 							"% finished, time remaining: " + timer.toStringMinutesSeconds());
 				}
 			}
+			i++;
 		}
 		try{
 			DataOutputStream output = new DataOutputStream(
 					Files.writeFileStream(INDEX_FILENAME));
-			for (int i = 0; i < termEstimates.length; i++)
+			for (i = 0; i < termEstimates.length; i++)
 				output.writeDouble(termEstimates[i]);
 			output.close();
 		}
Index: src/uk/ac/gla/terrier/indexing/Indexer.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/Indexer.java,v
retrieving revision 1.44
diff -w -u -r1.44 Indexer.java
--- src/uk/ac/gla/terrier/indexing/Indexer.java	28 Jan 2009 20:16:46 -0000	1.44
+++ src/uk/ac/gla/terrier/indexing/Indexer.java	26 Feb 2009 16:11:47 -0000
@@ -33,17 +33,17 @@
 import org.apache.log4j.Logger;
 
 import uk.ac.gla.terrier.structures.Index;
+import uk.ac.gla.terrier.structures.IndexUtil;
 import uk.ac.gla.terrier.structures.indexing.DirectIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.InvertedIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
 import uk.ac.gla.terrier.structures.merging.BlockStructureMerger;
 import uk.ac.gla.terrier.structures.merging.StructureMerger;
-import uk.ac.gla.terrier.terms.TermPipeline;
 import uk.ac.gla.terrier.terms.SkipTermPipeline;
+import uk.ac.gla.terrier.terms.TermPipeline;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
-import uk.ac.gla.terrier.utility.Files;
 /**
  * <B>Properties:</b>
  * <ul>
@@ -63,15 +63,6 @@
 	/** the logger for this class */
 	protected static Logger logger = Logger.getRootLogger();
 
-	protected static String[] indexFileSuffices = new String[]{
-					ApplicationSetup.PROPERTIES_SUFFIX,
-					ApplicationSetup.IFSUFFIX,
-					ApplicationSetup.DF_SUFFIX,
-					ApplicationSetup.LEXICON_INDEX_SUFFIX,
-					ApplicationSetup.LEXICONSUFFIX,
-					ApplicationSetup.DOC_INDEX_SUFFIX,
-					ApplicationSetup.LEXICON_HASH_SUFFIX};
-
 	protected boolean UTFIndexing = false;
 
 	/**
@@ -322,11 +313,10 @@
 		}
 		else
 		{
-			final String src = path + ApplicationSetup.FILE_SEPARATOR + prefix;
-			final String dest = path + ApplicationSetup.FILE_SEPARATOR + oldIndexPrefix;
-			for (String suffix: indexFileSuffices)
-			{
-				Files.rename(src+suffix, dest+suffix);
+			try{
+				IndexUtil.renameIndex(path, prefix, path, oldIndexPrefix);
+			} catch (IOException ioe ) {
+				logger.error("Could not rename index", ioe);
 			}
 		}
 		//restore the prefix
@@ -374,18 +364,13 @@
 										  
 		sMerger.setNumberOfBits(FieldScore.FIELDS_COUNT);
 		sMerger.mergeStructures();
-		
-		String separator = ApplicationSetup.FILE_SEPARATOR;
 		src1.close(); src2.close(); dst.close();
 		//delete old indices  
-		for(String suffix : indexFileSuffices)
-		{
-			Files.delete(index1[0]+separator+index1[1]+ suffix);
-		}
-
-		for(String suffix : indexFileSuffices)
-        {
-            Files.delete(index2[0]+separator+index2[1]+ suffix);
+		try{
+			IndexUtil.deleteIndex(index1[0], index1[1]);
+			IndexUtil.deleteIndex(index2[0], index2[1]);
+		} catch (IOException ioe) {
+			logger.warn("Could not delete merge input indices ", ioe);
         }
 	}
 
@@ -414,11 +399,10 @@
 		logger.info("Done merging");
 		
 		//rename the generated structures 
-		String src = mpath + ApplicationSetup.FILE_SEPARATOR + mprefix+"_"+ (counterMerged-1);
-		String dest = mpath + ApplicationSetup.FILE_SEPARATOR + mprefix;
-		for (String suffix: indexFileSuffices)
-		{
-			Files.rename(src+suffix, dest+suffix);
+		try{
+			IndexUtil.renameIndex(mpath, mprefix+"_"+ (counterMerged-1), mpath, mprefix);
+		} catch (IOException ioe) {
+			logger.error("Could not rename merged index", ioe);
 		}
 	}
 
Index: src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java,v
retrieving revision 1.5
diff -w -u -r1.5 Hadoop_BasicSinglePassIndexer.java
--- src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java	16 Feb 2009 21:43:02 -0000	1.5
+++ src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BasicSinglePassIndexer.java	26 Feb 2009 16:11:47 -0000
@@ -30,9 +30,9 @@
 import java.io.DataInputStream;
 import java.io.DataOutputStream;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.LinkedList;
-import java.util.ArrayList;
 
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -50,10 +50,12 @@
 import uk.ac.gla.terrier.compression.BitOutputStream;
 import uk.ac.gla.terrier.indexing.BasicSinglePassIndexer;
 import uk.ac.gla.terrier.indexing.Document;
+import uk.ac.gla.terrier.structures.BasicLexiconEntry;
 import uk.ac.gla.terrier.structures.DocumentIndexInputStream;
 import uk.ac.gla.terrier.structures.FilePosition;
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
 import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.DocumentPostingList;
 import uk.ac.gla.terrier.structures.indexing.singlepass.FieldPostingInRun;
@@ -63,8 +65,8 @@
 import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.HadoopRunWriter;
 import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.HadoopRunsMerger;
 import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapData;
-import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapEmittedTerm;
 import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapEmittedPostingList;
+import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.MapEmittedTerm;
 import uk.ac.gla.terrier.structures.indexing.singlepass.hadoop.SimpleDocumentIndexBuilder;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
@@ -348,7 +350,7 @@
 	 */
 	
 	/** OutputStream for the Lexicon*/ 
-	protected LexiconOutputStream lexstream;
+	protected LexiconOutputStream<String> lexstream;
 	/** runIterator factory being used to generate RunIterators */
 	protected HadoopRunIteratorFactory runIteratorF = null;
 	/** records whether the reduce() has been called for the first time */
@@ -434,11 +436,11 @@
 	 * flushed.
 	 * @param mapData - info about the runs(maps) and the flushes
 	 */
-	public void startReduce(LinkedList<MapData> mapData)
+	public void startReduce(LinkedList<MapData> mapData) throws IOException
 	{
 		logger.info("The number of Reduce Tasks being used : "+jc.getNumReduceTasks());
 		((HadoopRunsMerger)(super.merger)).beginMerge(mapData);
-		lexstream = createLexiconOutputStream(currentIndex.getPath(), currentIndex.getPrefix());
+		lexstream = new MapFileLexiconOutputStream(this.currentIndex, "lexicon", BasicLexiconEntry.Factory.class);
 		// Tell the merger how many to Reducers to merge for
 		((HadoopRunsMerger) merger).setNumReducers(jc.getNumReduceTasks());
 	}
@@ -523,13 +525,13 @@
 		currentIndex.addIndexStructure(
 				"inverted",
 				invertedIndexClass,
-				"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String",
-				"lexicon,path,prefix");
+				"uk.ac.gla.terrier.structures.Index,java.lang.String", 
+				"index,structureName");
 		currentIndex.addIndexStructureInputStream(
 	            "inverted",
-	            invertedIndexInputStreamClass,
-	            "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-	            "path,prefix,lexicon-inputstream");
+                "uk.ac.gla.terrier.structures.InvertedIndexInputStream",
+                "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+                "index,structureName,lexicon-inputstream");
 		currentIndex.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT );
 		
 		//3. document index
@@ -546,17 +548,13 @@
 		//4. close the map phase indices
 		for(Index i : sourceIndices)
 		{
-			String path = i.getPath();
-			String prefix = i.getPrefix();
 			i.close();
 		}
 		//5. finalise the lexicon
-		int numTerms;
-		currentIndex.setIndexProperty("num.Terms",""+ (numTerms = lexstream.getNumberOfTermsWritten()) );
+		currentIndex.setIndexProperty("num.Terms",""+ lexstream.getNumberOfTermsWritten() );
 		currentIndex.setIndexProperty("num.Tokens",""+lexstream.getNumberOfTokensWritten() );
 		currentIndex.setIndexProperty("num.Pointers",""+lexstream.getNumberOfPointersWritten() );
 		lexstream.close();
-		this.createLexicon(numTerms);
 		this.finishedInvertedIndexBuild();
 		currentIndex.flush();
 	}
@@ -573,7 +571,7 @@
 		try{
 			tempRM.setBos(new BitOutputStream(
 					currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR 
-					+ currentIndex.getPrefix() + ApplicationSetup.IFSUFFIX ));
+					+ currentIndex.getPrefix() + ".inverted.bf"));
 		} catch (IOException ioe) {
 			ioe.printStackTrace();
 		}
Index: src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java,v
retrieving revision 1.3
diff -w -u -r1.3 Hadoop_BlockSinglePassIndexer.java
--- src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java	28 Jan 2009 20:16:47 -0000	1.3
+++ src/uk/ac/gla/terrier/indexing/hadoop/Hadoop_BlockSinglePassIndexer.java	26 Feb 2009 16:11:47 -0000
@@ -265,7 +265,7 @@
 		try{
 			tempRM.setBos(new BitOutputStream(
 					currentIndex.getPath() + ApplicationSetup.FILE_SEPARATOR
-					+ currentIndex.getPrefix() + ApplicationSetup.IFSUFFIX ));
+					+ currentIndex.getPrefix() + ".inverted.bf" ));
 		} catch (IOException ioe) {
 			ioe.printStackTrace();
 		}
Index: src/uk/ac/gla/terrier/matching/LMMatching.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/matching/LMMatching.java,v
retrieving revision 1.29
diff -w -u -r1.29 LMMatching.java
--- src/uk/ac/gla/terrier/matching/LMMatching.java	28 Jan 2009 20:16:47 -0000	1.29
+++ src/uk/ac/gla/terrier/matching/LMMatching.java	26 Feb 2009 16:11:47 -0000
@@ -32,6 +32,7 @@
 import uk.ac.gla.terrier.matching.models.languagemodel.LanguageModel;
 import uk.ac.gla.terrier.matching.tsms.TermScoreModifier;
 import uk.ac.gla.terrier.structures.Index;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.indexing.DocumentInitialWeightIndex;
 import uk.ac.gla.terrier.structures.indexing.TermEstimateIndex;
 import uk.ac.gla.terrier.utility.HeapSort;
@@ -165,10 +166,10 @@
 		final int queryLength = queryTermStrings.length;
 		for (int i = 0; i < queryLength; i++) {
 			//we seek the query term in the lexicon
-			boolean found = lexicon.findTerm(queryTermStrings[i]);
-			//and if it is not found, we continue with the next term
-			if (!found)
+			LexiconEntry le = lexicon.getLexiconEntry(queryTermStrings[i]);
+			if (le == null)
 				continue;
+			
 			//because when the TreeNode is created, the term
 			//code assigned is taken from
 			//the TermCodes class, the assigned term code is
@@ -176,31 +177,31 @@
 			//process. Therefore, at this point, the term
 			//code should be updated with the one
 			//stored in the lexicon file.	
-			queryTerms.setTermProperty(queryTermStrings[i], lexicon.getTermId());
+			queryTerms.setTermProperty(queryTermStrings[i], le); // store the lexicon entry itself: setTermProperty(String,int) is replaced by (String,TermStatistics) in this patch
 			if(logger.isDebugEnabled()){
-				logger.debug("" + (i + 1) + ": " + queryTermStrings[i].trim() + "(" + lexicon.getTermId() + ")");
+				logger.debug("" + (i + 1) + ": " + queryTermStrings[i].trim() + "(" + le.getTermId() + ")");
 			}
 			//the weighting model is prepared for assigning scores to documents
-			wmodel.setTermFrequency((double)lexicon.getTF());
-			this.termFrequency[i] = (double)lexicon.getTF();
-			this.termEstimates[i] = this.termEstimateIndex.getTermEstimateByTermid(lexicon.getTermId());
+			wmodel.setTermFrequency((double)le.getFrequency());
+			this.termFrequency[i] = (double)le.getFrequency();
+			this.termEstimates[i] = this.termEstimateIndex.getTermEstimateByTermid(le.getTermId());
 			if(logger.isDebugEnabled()){
 				logger.debug(
 					" with "
-						+ lexicon.getNt()
+						+ le.getDocumentFrequency()
 						+ " documents (TF is "
-						+ lexicon.getTF()
+						+ le.getFrequency()
 						+ ").");
 			}
 			//check if the IDF is very low.
 			if(logger.isInfoEnabled()){
-				if (IGNORE_LOW_IDF_TERMS==true && docIndex.getNumberOfDocuments() < lexicon.getTF()) {
+				if (IGNORE_LOW_IDF_TERMS==true && docIndex.getNumberOfDocuments() < le.getFrequency()) {
 					logger.info("query term " + queryTermStrings[i] + " has low idf - ignored from scoring.");
 					continue;
 				}
 			}
 			//the postings are beign read from the inverted file.
-			pointers = invertedIndex.getDocuments(queryTerms.getTermCode(queryTermStrings[i]));
+			pointers = invertedIndex.getDocuments(le);
 			
 			init_tf(i, pointers);
 			
Index: src/uk/ac/gla/terrier/matching/Matching.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/matching/Matching.java,v
retrieving revision 1.62
diff -w -u -r1.62 Matching.java
--- src/uk/ac/gla/terrier/matching/Matching.java	28 Jan 2009 20:16:47 -0000	1.62
+++ src/uk/ac/gla/terrier/matching/Matching.java	26 Feb 2009 16:11:47 -0000
@@ -114,7 +114,7 @@
 	/** The document index used.*/
 	protected DocumentIndex docIndex;
 	/** The lexicon used.*/
-	protected Lexicon lexicon;
+	protected Lexicon<String> lexicon;
 	/** The inverted file.*/
 	protected InvertedIndex invertedIndex;
 	/** The collection statistics */
@@ -334,17 +334,18 @@
 			//the TermCodes class, the assigned term code is only valid during the indexing
 			//process. Therefore, at this point, the term code should be updated with the one
 			//stored in the lexicon file.	
-			queryTerms.setTermProperty(queryTermStrings[i], lEntry.termId);
+			queryTerms.setTermProperty(queryTermStrings[i], lEntry); // store the lexicon entry itself: setTermProperty(String,int) is replaced by (String,TermStatistics) in this patch
 			//the weighting model is prepared for assigning scores to documents
 			wmodel.setKeyFrequency(queryTerms.getTermWeight(queryTermStrings[i]));
-			wmodel.setDocumentFrequency((double)lEntry.n_t);
-			wmodel.setTermFrequency((double)lEntry.TF);
+			wmodel.setDocumentFrequency((double)lEntry.getDocumentFrequency());
+			wmodel.setTermFrequency((double)lEntry.getFrequency());
 			
-			logger.debug((i + 1) + ": " + queryTermStrings[i].trim() + " with " + lEntry.n_t + " documents (TF is " + lEntry.TF + ").");
+			logger.debug((i + 1) + ": " + queryTermStrings[i].trim() + " with " + lEntry.getDocumentFrequency() 
+					+ " documents (TF is " + lEntry.getFrequency() + ").");
 
 
 			//check if the IDF is very low.
-			if (IGNORE_LOW_IDF_TERMS && docIndex.getNumberOfDocuments() < lEntry.TF) {
+			if (IGNORE_LOW_IDF_TERMS && docIndex.getNumberOfDocuments() < lEntry.getFrequency()) {
 				logger.debug("query term " + queryTermStrings[i] + " has low idf - ignored from scoring.");
 				continue;
 			}
Index: src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java,v
retrieving revision 1.24
diff -w -u -r1.24 MatchingQueryTerms.java
--- src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java	28 Jan 2009 20:16:47 -0000	1.24
+++ src/uk/ac/gla/terrier/matching/MatchingQueryTerms.java	26 Feb 2009 16:11:47 -0000
@@ -34,6 +34,7 @@
 import uk.ac.gla.terrier.matching.dsms.DocumentScoreModifier;
 import uk.ac.gla.terrier.matching.tsms.TermScoreModifier;
 import uk.ac.gla.terrier.querying.parser.Query;
+import uk.ac.gla.terrier.structures.TermStatistics;
 /**
  * Models a query used for matching documents. It is created
  * by creating an instance of this class, and then passing it as
@@ -44,11 +45,13 @@
  * @author Vassilis Plachouras, Craig Macdonald.
  * @version $Revision: 1.24 $
  */
-public class MatchingQueryTerms implements Serializable,Cloneable{
-		
+public class MatchingQueryTerms implements Serializable,Cloneable
+{
+	private static final long serialVersionUID = -9134975387300425203L;
 	/** The weight and the modifiers associated with a query term.*/
-	protected static class QueryTermProperties implements Serializable{
-		
+	protected static class QueryTermProperties implements Serializable
+	{
+		private static final long serialVersionUID = 6327392687128896557L;
 		
 		/** The weight of a query term. This is usually how many times the term occurred
 		  * in the query, but sometime may be altered if a weight has been specified on the
@@ -56,8 +59,8 @@
 		  * on the unparsed query (example <tt>term1 term2^3</tt>). */
 		double weight;
 		
-		/** The term code (identifier) of the query term.*/
-		int termCode;
+		/** Info about the query term.*/
+		TermStatistics stats;
 		
 		/** The term score modifiers associated with a particular query term.*/
 		ArrayList<TermScoreModifier> modifiers = new ArrayList<TermScoreModifier>();
@@ -70,8 +73,8 @@
 		 * of a query term.
 		 * @param code int the term code of a query term. 
 		 */
-		public QueryTermProperties(int code) {
-			termCode = code;
+		public QueryTermProperties(TermStatistics _stats) {
+			stats = _stats;
 		}
 		
 		/** 
@@ -106,9 +109,9 @@
 		 * @param w double the weight of a query term. 
 		 * @param code int the term code of a query term. 
 		 */
-		public QueryTermProperties(double w, int code) {
+		public QueryTermProperties(double w, TermStatistics _stats) {
 			weight = w;
-			termCode = code;
+			stats = _stats;
 		}
 		
 		/**
@@ -117,9 +120,9 @@
 		 * @param tsm TermScoreModifier the modifier associated with a query term.
 		 * @param code int the term code of a query term. 
 		 */
-		public QueryTermProperties(TermScoreModifier tsm, int code) {
+		public QueryTermProperties(TermScoreModifier tsm, TermStatistics _stats) {
 			modifiers.add(tsm);
-			termCode = code;
+			stats = _stats;
 		}
 		
 		/**
@@ -128,15 +131,15 @@
 		 * @param tsm TermScoreModifier the modifier associated with a query term.
 		 * @param code int the term code of a query term. 
 		 */
-		public QueryTermProperties(double w, TermScoreModifier tsm, int code) {
+		public QueryTermProperties(double w, TermScoreModifier tsm, TermStatistics _stats) {
 			weight = w;
 			modifiers.add(tsm);
-			termCode = code;
+			stats = _stats;
 		}
 
 		public Object clone()
 		{
-			QueryTermProperties newO = new QueryTermProperties(weight, termCode);
+			QueryTermProperties newO = new QueryTermProperties(weight, stats);
 			for (TermScoreModifier tsm : modifiers)
 				newO.modifiers.add((TermScoreModifier)(tsm.clone()));
 			return (Object)newO;
@@ -144,7 +147,7 @@
 
 		public int hashCode()
 		{
-			int hashCodeValue = termCode;
+			int hashCodeValue = stats.hashCode();
 			hashCodeValue += (new Double(weight)).hashCode();
 			for (TermScoreModifier tsm : modifiers)
 			{
@@ -279,12 +282,12 @@
 	 * @param term String the term for which the term identifier is set.
 	 * @param code int the term identifier.
 	 */
-	public void setTermProperty(String term, int code) {
+	public void setTermProperty(String term, TermStatistics stats) {
 		QueryTermProperties properties = termProperties.get(term);
 		if (properties == null) {
-			termProperties.put(term, new QueryTermProperties(code));
+			termProperties.put(term, new QueryTermProperties(stats));
 		} else {
-			properties.termCode = code;
+			properties.stats = stats;
 		}
 	}
 	
@@ -364,11 +367,9 @@
 	 * @return int the term code of the given query term, or -1 if the term
 	 *         does not appear in the query.
 	 */
-	public int getTermCode(String term) {
-		QueryTermProperties tp = (QueryTermProperties)termProperties.get(term);
-		if (tp!=null)
-			return tp.termCode;
-		return -1;
+	public TermStatistics getStatistics(String term) {
+		QueryTermProperties tp = termProperties.get(term);
+		return (tp != null) ? tp.stats : null; // null, not an NPE, when the term has no properties; callers test the result against null
 	}
 	
 	/** 
@@ -379,7 +380,7 @@
 	 *         of the query. 
 	 */
 	public TermScoreModifier[] getTermScoreModifiers(String term) {
-		QueryTermProperties tp = (QueryTermProperties)termProperties.get(term);
+		QueryTermProperties tp = termProperties.get(term);
 		if (tp!=null)
 			return (TermScoreModifier[])tp.modifiers.toArray(tmpTSM);
 		return null;
Index: src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java,v
retrieving revision 1.18
diff -w -u -r1.18 BlockScoreModifier.java
--- src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java	28 Jan 2009 20:16:48 -0000	1.18
+++ src/uk/ac/gla/terrier/matching/dsms/BlockScoreModifier.java	26 Feb 2009 16:11:47 -0000
@@ -65,7 +65,7 @@
 		if (invertedIndex instanceof BlockInvertedIndex && 
 				query.length() > 1 && query.length() < 5) {
 			
-			Lexicon lexicon = index.getLexicon();
+			Lexicon<String> lexicon = index.getLexicon();
 			
 			int[] docids = resultSet.getDocids();
 			double[] scores = resultSet.getScores();
@@ -118,7 +118,7 @@
 					continue;
 				//double term1KeyFrequency = query.getTermWeight(term1);
 				
-				double term1DocumentFrequency = (double)tEntry1.n_t;
+				double term1DocumentFrequency = (double)tEntry1.getDocumentFrequency();
 				
 				//we seek the 2nd query term in the lexicon
 				LexiconEntry tEntry2 = lexicon.getLexiconEntry(term2);
@@ -126,7 +126,7 @@
 				if (tEntry1 == null)
 					continue;
 				//double term2KeyFrequency = query.getTermWeight(term2);
-				double term2DocumentFrequency = (double)tEntry2.n_t;
+				double term2DocumentFrequency = (double)tEntry2.getDocumentFrequency();
 				term1Pointers = invertedIndex.getDocuments(tEntry1);
 				
 				term1docids = term1Pointers[0];
Index: src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java,v
retrieving revision 1.22
diff -w -u -r1.22 PhraseScoreModifier.java
--- src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java	28 Jan 2009 20:16:48 -0000	1.22
+++ src/uk/ac/gla/terrier/matching/dsms/PhraseScoreModifier.java	26 Feb 2009 16:11:47 -0000
@@ -26,22 +26,22 @@
  */
 package uk.ac.gla.terrier.matching.dsms;
 
-import gnu.trove.TIntArrayList;
 import gnu.trove.TIntIntHashMap;
 
 import java.util.ArrayList;
 import java.util.Arrays;
 
+import org.apache.log4j.Logger;
+
 import uk.ac.gla.terrier.matching.MatchingQueryTerms;
 import uk.ac.gla.terrier.matching.ResultSet;
 import uk.ac.gla.terrier.querying.parser.SingleTermQuery;
 import uk.ac.gla.terrier.structures.BlockInvertedIndex;
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.InvertedIndex;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 
-import org.apache.log4j.Logger;
-
 /**
  * Modifies the scores of the documents which contain, or do not contain a given
  * phrase.
@@ -186,14 +186,15 @@
 		for (int i = 0; i < phraseLength; i++) {
 			docidsMap[i] = new TIntIntHashMap();
 			String t = ((SingleTermQuery) phraseTerms.get(i)).getTerm();
-			if (terms.getTermCode(t) == -1) {
-				index.getLexicon().findTerm(t);
-				int termCode = index.getLexicon().getTermId();
-				terms.setTermProperty(t, termCode);
+			if (terms.getStatistics(t) == null)
+			{
+				LexiconEntry le = index.getLexicon().getLexiconEntry(t);
+				if (le == null)
+					continue;
+				terms.setTermProperty(t, le);
 			}
 
-			int termCode = terms.getTermCode(t);
-			if (termCode != -1) {
+			
 				//for each phrase term, we store the identifiers of
 				//documents that contain that term in a hashmap
 				//we also convert the block frequencies into
@@ -204,7 +205,7 @@
 				//For j-th document in the postings lists postings[i]
 				//the positions start at postings[i][4][postings[i][3][j-1]]
 				//and end at postings[i][4][postings[i][3][j]-1]
-				postings[i] = invIndex.getDocuments(terms.getTermCode(t));
+			postings[i] = invIndex.getDocuments((LexiconEntry)terms.getStatistics(t));
 
 				for (int j = 0; j < postings[i][0].length; j++) {
 					//note that the entries in the docidsMap hash sets have
@@ -213,7 +214,6 @@
 					if (j > 0)
 						postings[i][3][j] += postings[i][3][j - 1];
 				}
-			}
 
 		}
 		try {
Index: src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java,v
retrieving revision 1.32
diff -w -u -r1.32 BlockInvertedIndex.java
--- src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java	28 Jan 2009 20:16:54 -0000	1.32
+++ src/uk/ac/gla/terrier/structures/BlockInvertedIndex.java	26 Feb 2009 16:11:47 -0000
@@ -41,17 +41,8 @@
 	protected int DocumentBlockCountDelta = 1;
 	protected BlockInvertedIndex() {}
 
-	/**
-	 * Creates an instance of the BlockInvertedIndex class 
-	 * using the given lexicon.
-	 * @param lexicon The lexicon used for retrieval
-	 */
-	public BlockInvertedIndex(Lexicon lexicon) {
-		super(lexicon);
-	}
-
-	public BlockInvertedIndex(Lexicon lexicon, String path, String prefix) {
-		super(lexicon, path, prefix);
+	public BlockInvertedIndex(Index index, String structureName) {
+		super(index, structureName);
 	}
 	
 	/**
@@ -60,8 +51,8 @@
 	 * @param lexicon The lexicon used for retrieval
 	 * @param filename the name of the inverted file
 	 */
-	public BlockInvertedIndex(Lexicon lexicon, String filename) {
-		super(lexicon, filename);
+	public BlockInvertedIndex(String filename) {
+		super(filename);
 	}
 
 	/** let it know which index to use */
@@ -70,38 +61,7 @@
 		DocumentBlockCountDelta = i.getIntIndexProperty("blocks.invertedindex.countdelta", 1);
 	}
 
-	/**
-	 * Prints out the block inverted index file.
-	 */
-	public void print() {
-		for (int i = 0; i < lexicon.getNumberOfLexiconEntries(); i++) {
-			lexicon.findTerm(i);
-			System.out.print("Term ("+lexicon.getTerm()+","+i+") : ");
-			int[][] documents = getDocuments(i);
-			int blockindex = 0;
-			for (int j = 0; j < documents[0].length; j++) {
-				System.out.print(
-					"("
-						+ documents[0][j]
-						+ ", "
-						+ documents[1][j]
-						+ ", ");
-				if (FieldScore.USE_FIELD_INFORMATION)
-				{
-					System.out.print(documents[2][j]
-					+ ", ");
-				}
-				System.out.print( documents[3][j]);
 				
-				for (int k = 0; k < documents[3][j]; k++) {
-					System.out.print(", B" + documents[4][blockindex]);
-					blockindex++;
-				}
-				System.out.print(")");
-			}
-			System.out.println();
-		}
-	}
 	/**
 	 * Returns a 2D array containing the document ids, 
 	 * the term frequencies, the field scores the block frequencies and 
@@ -118,7 +78,11 @@
 	 * @param df the number of postings to expect 
 	 */
 
-	public int[][] getDocuments(final long startOffset, final byte startBitOffset, final long endOffset, final byte endBitOffset, final int df) {
+	public int[][] getDocuments(BitIndexPointer pointer) {
+		
+		final long startOffset = pointer.getBytes();
+		final byte startBitOffset = pointer.getBits();
+		final int df = pointer.getNumberOfEntries();
 		
 		final int fieldCount = FieldScore.FIELDS_COUNT;
 		final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION;
@@ -131,8 +95,7 @@
 		final TIntArrayList blockids = new TIntArrayList(df); //ideally we'd have TF here
 
 		try{
-		
-			final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);
+			final BitIn file = this.file.readReset(startOffset, startBitOffset);
 	
 			if (loadTagInformation) { //if there are tag information to process
 				//documentTerms[2] = new int[df]; 
@@ -196,247 +159,4 @@
 			return null;
 		}
 	}
-
-
-	//* @param termid the id of the term whose documents we are looking for.
-	//public int[][] getDocuments(int termid) {
-	/*public int[][] getDocuments(final long startOffset, final byte startBitOffset, final long endOffset, final byte endBitOffset, int df) {
-
-		//boolean found = lexicon.findTerm(termid);
-		final byte startBitOffset = lexicon.getStartBitOffset();
-		final long startOffset = lexicon.getStartOffset();
-		final byte endBitOffset = lexicon.getEndBitOffset();
-		final long endOffset = lexicon.getEndOffset();
-
-		final int FIELDS_COUNT = FieldScore.FIELDS_COUNT;
-
-		// TODO use heuristics here like we do in InvertedIndex.java
-		 // for setting a good guess of the arraylist sizes. 
-		TIntArrayList temporaryDocids = new TIntArrayList();
-		TIntArrayList temporaryTFs = new TIntArrayList();
-		TIntArrayList temporaryFields = new TIntArrayList();
-		TIntArrayList temporaryBlockFreq = new TIntArrayList();
-		TIntArrayList temporaryBlockIds = new TIntArrayList();
-		int previousDocid = -1;
-			
-		//ArrayList temporaryTerms = new ArrayList();
-		//ArrayList temporaryBlockids = new ArrayList();
-		//int blockcount = 0;
-		try{
-			final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);
-			//boolean hasMore = false;
-			while (((file.getByteOffset() + startOffset) < endOffset)
-				|| (((file.getByteOffset() + startOffset) == endOffset)
-					&& (file.getBitOffset() < endBitOffset))) {
-	
-				temporaryDocids.add(previousDocid = file.readGamma() + previousDocid);
-				temporaryTFs.add(file.readUnary());
-				temporaryFields.add(file.readBinary(FIELDS_COUNT));
-				
-				/*int docId = file.readGamma();
-				/int[] tmp = new int[4];
-				tmp[0] = docId;
-				tmp[1] = file.readUnary();
-				tmp[2] = file.readBinary(FIELDS_COUNT);
-				
-				final int blockfreq = file.readUnary();
-				temporaryBlockFreq.add(blockfreq);
-				//tmp[3] = blockfreq;
-				//System.out.print("docid="+previousDocid + "blockfreq="+blockfreq);
-	
-				int[] tmp2 = new int[blockfreq];
-				int previousBlockId = -1;
-				//System.out.print(" blocks=");
-				for (int i = 0; i < blockfreq; i++) {
-					tmp2[i] = previousBlockId = file.readGamma() + previousBlockId;
-					 //System.out.print(previousBlockId + ",");
-					//blockcount++;
-				}
-				// System.out.println("");
-				//temporaryTerms.add(tmp);
-				//temporaryBlockids.add(tmp2);
-				temporaryBlockIds.add(tmp2);
-			}
-			int[][] documentTerms = new int[5][];
-			documentTerms[0] = temporaryDocids.toNativeArray(); //new int[temporaryTerms.size()];
-			documentTerms[1] = temporaryTFs.toNativeArray(); //new int[temporaryTerms.size()];
-			documentTerms[2] = temporaryFields.toNativeArray(); //new int[temporaryTerms.size()];
-			documentTerms[3] = temporaryBlockFreq.toNativeArray(); //new int[temporaryTerms.size()];
-			documentTerms[4] =	temporaryBlockIds.toNativeArray(); //new int[blockcount];
-			/*
-			documentTerms[0][0] = ((int[]) temporaryTerms.get(0))[0] - 1;
-			documentTerms[1][0] = ((int[]) temporaryTerms.get(0))[1];
-			documentTerms[2][0] = ((int[]) temporaryTerms.get(0))[2];
-			documentTerms[3][0] = ((int[]) temporaryTerms.get(0))[3];
-			int[] blockids = ((int[]) temporaryBlockids.get(0));
-			documentTerms[4][0] = blockids[0] - 1;
-			for (int i = 1; i < blockids.length; i++) {
-				documentTerms[4][i] = blockids[i] + documentTerms[4][i - 1];
-			}
-			int blockindex = blockids.length;
-			if (documentTerms[0].length > 1) {
-				for (int i = 1; i < documentTerms[0].length; i++) {
-					int[] tmpMatrix = (int[]) temporaryTerms.get(i);
-					documentTerms[0][i] = tmpMatrix[0] + documentTerms[0][i - 1];
-					documentTerms[1][i] = tmpMatrix[1];
-					documentTerms[2][i] = tmpMatrix[2];
-					documentTerms[3][i] = tmpMatrix[3];
-					blockids = ((int[]) temporaryBlockids.get(i));
-					documentTerms[4][blockindex] = blockids[0] - 1;
-					blockindex++;
-					for (int j = 1; j < blockids.length; j++) {
-						documentTerms[4][blockindex] =
-							blockids[j] + documentTerms[4][blockindex - 1];
-						blockindex++;
-					}
-				}
-			}
-			return documentTerms;
-		}catch (IOException ioe) {
-			logger.error("Problem reading direct index", ioe);
-			return null;
-		}
-	}*/
-
-
-	/*public int[][] getDocumentsWithoutBlocks(int termid, int startDocid, int endDocid) {
-		if (! lexicon.findTerm(termid))
-			return null;
-	
-		byte startBitOffset = lexicon.getStartBitOffset();
-		long startOffset = lexicon.getStartOffset();
-		byte endBitOffset = lexicon.getEndBitOffset();
-		long endOffset = lexicon.getEndOffset();
-		// TODO use heuristics here like we do in InvertedIndex.java
-		// for setting a good guess of the arraylist sizes. 
-		ArrayList<int[]> temporaryTerms = new ArrayList<int[]>();
-		//int blockcount = 0;
-		try{
-			final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);
-			//boolean hasMore = false;
-			final int fieldCount = FieldScore.FIELDS_COUNT;
-			while (((file.getByteOffset() + startOffset) < endOffset)
-					|| (((file.getByteOffset() + startOffset) == endOffset)
-					&& (file.getBitOffset() < endBitOffset))) {
-				int docId = file.readGamma();
-				int[] tmp = new int[3];
-				tmp[0] = docId;
-				tmp[1] = file.readUnary();
-				tmp[2] = file.readBinary(fieldCount);
-			 
-				//read the blocks, but dont save them
-				int blockfreq = file.readUnary();
-				for (int i = 0; i < blockfreq; i++) {
-					file.readGamma();
-				 }
-				if (docId >= startDocid && docId <=endDocid){
-					temporaryTerms.add(tmp);		
-				}
-			}
-			int[][] documentTerms = new int[3][];
-			if (temporaryTerms.size()>0){
-				documentTerms[0] = new int[temporaryTerms.size()];
-				documentTerms[1] = new int[temporaryTerms.size()];
-				documentTerms[2] = new int[temporaryTerms.size()];
-	 
-				documentTerms[0][0] = ((int[]) temporaryTerms.get(0))[0] - 1;
-				documentTerms[1][0] = ((int[]) temporaryTerms.get(0))[1];
-				documentTerms[2][0] = ((int[]) temporaryTerms.get(0))[2];
-		 
-				if (documentTerms[0].length > 1) {
-					for (int i = 1; i < documentTerms[0].length; i++) {
-						int[] tmpMatrix = (int[]) temporaryTerms.get(i);
-						documentTerms[0][i] = tmpMatrix[0] + documentTerms[0][i - 1];
-						documentTerms[1][i] = tmpMatrix[1];
-						documentTerms[2][i] = tmpMatrix[2];
-			 		}
-				}
-			}
-			return documentTerms;
-		} catch (IOException ioe) {
-			logger.error("Problem reading direct index", ioe);
-			return null;
-		}
-	}
-	*/
-	public int[][] getDocuments(int termid) {
-		 LexiconEntry lEntry = lexicon.getLexiconEntry(termid);
-		if (lEntry == null)
-			return null;
-		return getDocuments(lEntry.startOffset,
-			lEntry.startBitOffset,
-			lEntry.endOffset,
-			lEntry.endBitOffset, lEntry.n_t);
-	}
-	public int[][] getDocumentsWithoutBlocks(int termid) {
-		LexiconEntry lEntry = lexicon.getLexiconEntry(termid);
-		if (lEntry == null)
-			return null;
-		return getDocumentsWithoutBlocks(lEntry.startOffset,
-			lEntry.startBitOffset,
-			lEntry.endOffset,
-			lEntry.endBitOffset, lEntry.n_t);
-	}
-
-	public int[][] getDocumentsWithoutBlocks(LexiconEntry lEntry)
-	{
-		return getDocumentsWithoutBlocks(
-			lEntry.startOffset,
-			lEntry.startBitOffset,
-			lEntry.endOffset,
-			lEntry.endBitOffset, lEntry.n_t);
-	}
-
-	public int[][] getDocumentsWithoutBlocks(long startOffset,  byte startBitOffset, long endOffset, byte endBitOffset, int df)
-	{	
-		int[][] documentTerms = null;
-		try{
-			final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);
-			final int fieldCount = FieldScore.FIELDS_COUNT;
-			 final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION;
- 			if (loadTagInformation) { //if there are tag information to process		 
-				documentTerms = new int[3][df];
-				documentTerms[0][0] = file.readGamma() - 1;
-				documentTerms[1][0] = file.readUnary();
-				documentTerms[2][0] = file.readBinary(fieldCount);
-				//read the blocks, but dont save them
-				int blockfreq = file.readUnary() - DocumentBlockCountDelta;
-				for (int j = 0; j < blockfreq; j++) {
-					file.readGamma();
-				 }
-				for (int i = 1; i < df; i++) {
-					documentTerms[0][i]  = file.readGamma() + documentTerms[0][i - 1];
-					documentTerms[1][i]  = file.readUnary();
-					documentTerms[2][i]  = file.readBinary(fieldCount);
-					//read the blocks, but dont save them
-			   		blockfreq = file.readUnary() - DocumentBlockCountDelta;
-					for (int j = 0; j < blockfreq; j++) {
-						file.readGamma();
-				 	}
-				}
-			} else { //no tag information to process					
-				documentTerms = new int[2][df];
-				documentTerms[0][0] = file.readGamma() - 1;
-				documentTerms[1][0] = file.readUnary();
-				//read the blocks, but dont save them
-				int blockfreq = file.readUnary() - DocumentBlockCountDelta;
-				for (int j = 0; j < blockfreq; j++) {
-					file.readGamma();
-				 }
-				for(int i = 1; i < df; i++){
-					documentTerms[0][i] = file.readGamma() + documentTerms[0][i - 1];
-					documentTerms[1][i] = file.readUnary();
-					//read the blocks, but dont save them
-					blockfreq = file.readUnary() - DocumentBlockCountDelta;
-					for (int j = 0; j < blockfreq; j++) {
-						file.readGamma();
-				 	}
-				}
-			}
-			return documentTerms;
-		} catch (IOException ioe) {
-			logger.error("Problem reading inverted index", ioe);
-			return null;
-		}
-	}
 }
Index: src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java,v
retrieving revision 1.4
diff -w -u -r1.4 BlockInvertedIndexInputStream.java
--- src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java	28 Jan 2009 20:16:54 -0000	1.4
+++ src/uk/ac/gla/terrier/structures/BlockInvertedIndexInputStream.java	26 Feb 2009 16:11:47 -0000
@@ -26,10 +26,12 @@
  */
 package uk.ac.gla.terrier.structures;
 
-import java.io.IOException;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
 import gnu.trove.TIntArrayList;
-import uk.ac.gla.terrier.compression.BitIn;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+
 import uk.ac.gla.terrier.utility.FieldScore;
 
 /** Reads a BlockInvertedIndex as a stream
@@ -37,38 +39,18 @@
   * @since 2.0
   * @version $Revision: 1.4 $
   */
-public class BlockInvertedIndexInputStream extends InvertedIndexInputStream implements IndexConfigurable 
+public class BlockInvertedIndexInputStream extends InvertedIndexInputStream 
 {
     protected int DocumentBlockCountDelta = 1;
-	/** Make a new BlockInvertedIndexInputStream from the specified path/prefix combo. The LexiconInputStream
-	  * is required to determine the offsets and the document frequency - ie number of postings for
- 	  * each term. */
-	public BlockInvertedIndexInputStream(String path, String prefix, LexiconInputStream lis) throws IOException
-	{
-		super(path, prefix, lis);
-	}
-	
-	/** Make a new BlockInvertedIndexInputStream from the specified filename. The LexiconInputStream
-	  * is required to determine the offsets and the document frequency - ie number of postings for
- 	  * each term.
-	  * @param filename Location of the inverted file to open */
-	public BlockInvertedIndexInputStream(String filename, LexiconInputStream lis) throws IOException
-	{
-		super(filename, lis);
-	}
-
-	public BlockInvertedIndexInputStream(BitIn invFile, LexiconInputStream lis) throws IOException
-	{
-		super(invFile, lis);
-	}
 
-    /** let it know which index to use */
-    public void setIndex(Index i)
+    public BlockInvertedIndexInputStream(Index _index, String structureName, Iterator<Map.Entry<?, ? extends BitIndexPointer>> positions) throws IOException
     {
-        DocumentBlockCountDelta = i.getIntIndexProperty("blocks.invertedindex.countdelta", 1);
+    	super(_index, structureName, positions);
+    	DocumentBlockCountDelta = _index.getIntIndexProperty("blocks.invertedindex.countdelta", 1);
     }
 
-	protected int[][] getNextDocuments(int df, long endByteOffset, byte endBitOffset) throws IOException {
+    protected int[][] getNextDocuments(BitIndexPointer pointer) throws IOException {
+    	final int df = pointer.getNumberOfEntries();
 		final int fieldCount = FieldScore.FIELDS_COUNT;
 		final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION;
 		
@@ -144,7 +126,7 @@
 		try{
 		while((documents = getNextDocuments()) != null)
 		{
-			System.out.print("tid"+i);
+			System.out.print(i+"th term:");
 			int blockindex = 0;
 			for (int j = 0; j < documents[0].length; j++) {
 				System.out.print(
Index: src/uk/ac/gla/terrier/structures/ExpansionTerms.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/ExpansionTerms.java,v
retrieving revision 1.40
diff -w -u -r1.40 ExpansionTerms.java
--- src/uk/ac/gla/terrier/structures/ExpansionTerms.java	28 Jan 2009 20:16:55 -0000	1.40
+++ src/uk/ac/gla/terrier/structures/ExpansionTerms.java	26 Feb 2009 16:11:47 -0000
@@ -31,6 +31,8 @@
 import gnu.trove.TIntHashSet;
 import gnu.trove.TIntObjectHashMap;
 
+import java.util.Map;
+
 import org.apache.log4j.Logger;
 
 import uk.ac.gla.terrier.matching.MatchingQueryTerms;
@@ -55,7 +57,7 @@
 	/** The terms in the top-retrieval documents. */
 	protected TIntObjectHashMap<ExpansionTerm> terms;
 	/** The lexicon used for retrieval. */
-	protected Lexicon lexicon;
+	protected Lexicon<String> lexicon;
 	/** The number of documents in the collection. */
 	protected int numberOfDocuments;
 	/** The number of tokens in the collection. */
@@ -167,7 +169,7 @@
 	* @param totalLength The sum of the length of the top-retrieved documents.
 	* @param lexicon Lexicon The lexicon used for retrieval.
  	*/
-	public ExpansionTerms(CollectionStatistics collStats, double totalLength, Lexicon lexicon) {
+	public ExpansionTerms(CollectionStatistics collStats, double totalLength, Lexicon<String> lexicon) {
 		this(
 				collStats.getNumberOfDocuments(),
 				collStats.getNumberOfTokens(),
@@ -186,7 +188,7 @@
 			long numberOfTokens,
 			double averageDocumentLength,
 			double totalLength, 
-			Lexicon lexicon) {
+			Lexicon<String> lexicon) {
 		this.numberOfDocuments = numberOfDocuments;
 		this.numberOfTokens = numberOfTokens;
 		this.averageDocumentLength = averageDocumentLength;
@@ -255,10 +257,10 @@
 				}
 				
 				double TF = 0;
-				double Nt = 0;
-				lexicon.findTerm(allTerms[i].getTermID());
-				TF = lexicon.getTF();
-				Nt = lexicon.getNt();
+				//double Nt = 0;
+				TermStatistics ts = lexicon.getLexiconEntry(allTerms[i].getTermID()).getValue();
+				TF = ts.getFrequency();
+				//Nt = ts.getDocumentFrequency();
 				allTerms[i].setWeightExpansion(QEModel.score(
 					allTerms[i].getWithinDocumentFrequency(),
 					TF
@@ -283,9 +285,9 @@
 				logger.info("parameter free query expansion.");
 			}
 		}
-		lexicon.findTerm(allTerms[posMaxWeight].termID);
 		if(logger.isDebugEnabled()){
-		logger.debug("term with the maximum weight: " + lexicon.getTerm() +
+			String term = lexicon.getLexiconEntry(allTerms[posMaxWeight].termID).getKey();
+			logger.debug("term with the maximum weight: " + term +
 				", normaliser: " + Rounding.toString(normaliser, 4));
 		}
 		THashSet<SingleTermQuery> expandedTerms = new THashSet<SingleTermQuery>();
@@ -303,8 +305,8 @@
 					allTerms[i] = temp;
 				}
 				
-				lexicon.findTerm(allTerms[i].getTermID());
-				final SingleTermQuery expandedTerm = new SingleTermQuery(lexicon.getTerm());//new TermTreeNode(lexicon.getTerm());
+				String term = lexicon.getLexiconEntry(allTerms[i].getTermID()).getKey();
+				final SingleTermQuery expandedTerm = new SingleTermQuery(term);
 				
 				expandedTerm.setWeight(allTerms[i].getWeightExpansion()/normaliser);
 				
@@ -323,11 +325,11 @@
 				if (weighedOriginalTermsCount==originalTerms.size())
 					break;
 				
-				lexicon.findTerm(allTerms[i].getTermID());
-				if (!originalTerms.contains(lexicon.getTerm()))
+				String term = lexicon.getLexiconEntry(allTerms[i].getTermID()).getKey();
+				if (!originalTerms.contains(term))
 					continue;
 				weighedOriginalTermsCount++;
-				final SingleTermQuery expandedTerm = new SingleTermQuery(lexicon.getTerm());//new TermTreeNode(lexicon.getTerm());
+				final SingleTermQuery expandedTerm = new SingleTermQuery(term);
 				expandedTerm.setWeight(allTerms[i].getWeightExpansion()/normaliser);
 				//expandedTerms[i].normalisedFrequency = 
 				//terms[i].getWeightExpansion()/normaliser;
@@ -350,7 +352,7 @@
 		this.originalTerms.clear();
 		for (int i=0; i<terms.length; i++){
 			this.originalTerms.add(terms[i]);
-			this.originalTermids.add(query.getTermCode(terms[i]));
+			this.originalTermids.add(query.getStatistics(terms[i]).getTermId());
 		}
 	}
 
@@ -367,9 +369,9 @@
 	 * @param model QueryExpansionModel the used query expansion model.
 	 * @return double the weight of the specified term.
 	 */
-	public double getExpansionWeight(String term, QueryExpansionModel model){
-		lexicon.findTerm(term);
-		return this.getExpansionWeight(lexicon.termId, model);
+	public double getExpansionWeight(String term, QueryExpansionModel model)
+	{
+		return this.getExpansionWeight(lexicon.getLexiconEntry(term).getTermId(), model);
 	}
 	
 	/**
@@ -377,9 +379,9 @@
 	 * @param term String the term to get the weight for.
 	 * @return double the weight of the specified term.
 	 */
-	public double getExpansionWeight(String term){
-		lexicon.findTerm(term);
-		return this.getExpansionWeight(lexicon.termId);
+	public double getExpansionWeight(String term)
+	{
+		return this.getExpansionWeight(lexicon.getLexiconEntry(term).getTermId());
 	}
 	/**
 	 * Returns the un-normalised weight of a given term.
@@ -396,8 +398,7 @@
 	 * @return double the frequency of the specified term in the top-ranked documents.
 	 */
 	public double getFrequency(String term){
-		lexicon.findTerm(term);
-		return this.getFrequency(lexicon.getTermId());
+		return this.getFrequency(lexicon.getLexiconEntry(term).getTermId());
 	}
 	
 	/**
@@ -452,10 +453,11 @@
 				}
 				
 				double TF = 0;
-				double Nt = 0;
-				lexicon.findTerm(allTerms[i].getTermID());
-				TF = lexicon.getTF();
-				Nt = lexicon.getNt();
+				//double Nt = 0;
+				LexiconEntry le = lexicon.getLexiconEntry(allTerms[i].getTermID()).getValue();
+				
+				TF = le.getFrequency();
+				//Nt = le.getDocumentFrequency();
 				allTerms[i].setWeightExpansion(QEModel.score(
 					allTerms[i].getWithinDocumentFrequency(),
 					TF
@@ -479,9 +481,10 @@
 				logger.info("parameter free query expansion.");
 			}
 		}
-		lexicon.findTerm(allTerms[posMaxWeight].termID);
+		
 		if(logger.isDebugEnabled()){
-			logger.debug("term with the maximum weight: " + lexicon.getTerm() +
+			String term = lexicon.getLexiconEntry(allTerms[posMaxWeight].termID).getKey();
+			logger.debug("term with the maximum weight: " + term +
 				", normaliser: " + Rounding.toString(normaliser, 4));
 		}
 		for (int i = 0; i < len; i++){
@@ -508,10 +511,11 @@
 		if (o != null)
 		{
 			double TF = 0;
-			double Nt = 0;
-			lexicon.findTerm(termId);
-			TF = lexicon.getTF();
-			Nt = lexicon.getNt();
+			//double Nt = 0;
+			Map.Entry<String,LexiconEntry> lse = lexicon.getLexiconEntry(termId);
+			TF = lse.getValue().getFrequency();
+			//Nt = lse.getValue().getDocumentFrequency();
+			
 			score = model.score(((ExpansionTerm)o).getWithinDocumentFrequency(),
 					TF,
 					this.totalDocumentLength,
Index: src/uk/ac/gla/terrier/structures/FilePosition.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/FilePosition.java,v
retrieving revision 1.12
diff -w -u -r1.12 FilePosition.java
--- src/uk/ac/gla/terrier/structures/FilePosition.java	28 Jan 2009 20:16:55 -0000	1.12
+++ src/uk/ac/gla/terrier/structures/FilePosition.java	26 Feb 2009 16:11:47 -0000
@@ -38,7 +38,7 @@
  * a bit offset of 2.
  * @author Craig Macdonald, Vassilis Plachouras &amp; John Kane
  */
-public class FilePosition
+public class FilePosition implements BitFilePosition
 {
 	/** The number of bytes a file position could be converted to
 	 * - 8 for the byte's long, 1 for the bits
@@ -83,6 +83,15 @@
 		Bits = in.Bits;
 	}
 
+	public long getBytes() { return Bytes; }
+	public byte getBits() { return Bits; }
+	
+	public void setPosition(long bytes, byte bits)
+	{
+		Bytes = bytes;
+		Bits = bits;
+	}
+
 	/** How large is this object when serialized */	
 	public static int sizeInBytes()
 	{
Index: src/uk/ac/gla/terrier/structures/Index.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/Index.java,v
retrieving revision 1.35
diff -w -u -r1.35 Index.java
--- src/uk/ac/gla/terrier/structures/Index.java	28 Jan 2009 20:16:55 -0000	1.35
+++ src/uk/ac/gla/terrier/structures/Index.java	26 Feb 2009 16:11:47 -0000
@@ -377,7 +377,9 @@
 			String structureClassName = properties.getProperty("index."+structureName+".class");
 			if (structureClassName == null)
 			{
-				logger.error("This index ("+this.toString()+") doesnt have an index structure called "+ structureName);
+				logger.error("This index ("+this.toString()+") doesnt have an index structure called "+ structureName 
+						+ ": property index."+structureName+".class not found");
+				logger.error(properties.toString());
 				return null;//TODO exceptions?
 			}
 			//obtain the class definition for the index structure
@@ -385,7 +387,8 @@
 			try{
 				indexStructureClass = Class.forName(structureClassName, false, this.getClass().getClassLoader());
 			} catch (ClassNotFoundException cnfe) {
-				logger.error("This index ("+this.toString()+") references an unknown index structure class: "+structureName+ " looking for "+ structureClassName);
+				logger.error("ClassNotFoundException: This index ("+this.toString()+") references an unknown index structure class: "+structureName+ " looking for "+ structureClassName);
+				cnfe.printStackTrace();
 				return null;//TODO exceptions?
 			}
 
@@ -416,8 +419,20 @@
 						objs[i] = prefix;
 					else if (p.equals("index"))
 						objs[i] = this;
+					else if (p.equals("structureName"))
+					{
+						final String tmp = structureName;
+						objs[i] = tmp.replaceAll("-inputstream$", "");
+					}
 					else if (p.endsWith("-inputstream"))//no caching for input streams
 						 objs[i] = loadIndexStructure(p);
+					else if (p.matches("^\\$\\{.+\\}$"))
+					{
+						String propertyName = p.substring(2,p.length()-1);
+						objs[i] = properties.getProperty(propertyName, ApplicationSetup.getProperty("max.term.length", ""+20));
+						if (objs[i] == null)
+							throw new IllegalArgumentException("Property "+propertyName+" not found");
+					}
 					else
 						objs[i] = getIndexStructure(p);
 					i++;
@@ -492,7 +507,7 @@
 			}
 			try{
 				final OutputStream outputStream = Files.writeFileStream(propertiesFilename); 
-				properties.store(outputStream,"");
+				properties.store(outputStream,this.toString());
 				outputStream.close(); 
 			} catch (IOException ioe) {
 				logger.warn("Could not write to index properties at "+propertiesFilename + " - some changes may be lost", ioe);
@@ -517,9 +532,10 @@
 		return (DirectIndex)getIndexStructure("direct");
 	}
 	/** Return the Lexicon associated with this index */
-	public Lexicon getLexicon()
+	@SuppressWarnings("unchecked")
+	public Lexicon<String> getLexicon()
 	{
-		return (Lexicon)getIndexStructure("lexicon");
+		return (Lexicon<String>)getIndexStructure("lexicon");
 	}
 	/** Return the DocumentIndex associated with this index */
 	public DocumentIndex getDocumentIndex()
Index: src/uk/ac/gla/terrier/structures/InvertedIndex.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/InvertedIndex.java,v
retrieving revision 1.40
diff -w -u -r1.40 InvertedIndex.java
--- src/uk/ac/gla/terrier/structures/InvertedIndex.java	28 Jan 2009 20:16:55 -0000	1.40
+++ src/uk/ac/gla/terrier/structures/InvertedIndex.java	26 Feb 2009 16:11:47 -0000
@@ -27,14 +27,13 @@
  */
 package uk.ac.gla.terrier.structures;
 import java.io.IOException;
-import java.util.ArrayList;
+
 import org.apache.log4j.Logger;
 
-import uk.ac.gla.terrier.compression.BitFile;
+import uk.ac.gla.terrier.compression.BitFileBuffered;
 import uk.ac.gla.terrier.compression.BitIn;
 import uk.ac.gla.terrier.compression.BitInSeekable;
 import uk.ac.gla.terrier.compression.OldBitFile;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
 /**
  * This class implements the inverted index 
@@ -67,43 +66,30 @@
 	/** Filename of the open file */
 	protected String filename;
 	
-	/**
-	 * The lexicon used for retrieving documents.
-	 */
-	protected Lexicon lexicon;
-
 	/** A constructor for child classes that doesnt open the file */
 	protected InvertedIndex(long a, long b, long c) { }
 
+	
+
 	/** A default constructor, only for use by child classes */
 	protected InvertedIndex()
 	{
 	
 	}
 
-	public InvertedIndex(Lexicon lexicon, String path, String prefix)
+	public InvertedIndex(Index index, String structureName)
 	{
-		this(lexicon, path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.IFSUFFIX);
+		this(index.getPath() + "/" + index.getPrefix() + "." + structureName + ".bf");
 	}
 	
 	/**
-	 * Creates an instance of the HtmlInvertedIndex class using the lexicon.
-	 * @param lexicon The lexicon used for retrieval
-	 */
-	public InvertedIndex(Lexicon lexicon) {
-		this(lexicon, ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
-		//file = new BitFile(ApplicationSetup.INVERTED_FILENAME, "r");
-		//this.lexicon = lexicon;
-	}
-	/**
 	 * Creates an instance of the HtmlInvertedIndex class using the given
 	 * lexicon.
 	 * @param lexicon The lexicon used for retrieval
 	 * @param filename The name of the inverted file
 	 */
-	public InvertedIndex(Lexicon lexicon, String filename) {
-		file = new BitFile(this.filename = filename, "r");
-		this.lexicon = lexicon;
+	public InvertedIndex(String filename) {
+		file = new BitFileBuffered(this.filename = filename);
 	}
 	/** forces the data structure to reopen the underlying bitfile
 	 *  using the legacy implementation of BitFile (OldBitFile)
@@ -115,349 +101,49 @@
 		file = new OldBitFile(filename, "r");
 	}
 	
-	/**
-	 * Prints out the inverted index file.
-	 */
-	public void print() {
-		for (int i = 0; i < lexicon.getNumberOfLexiconEntries(); i++) {
-			int[][] documents = getDocuments(i);
-			System.out.print("tid"+i);
-			if (useFieldInformation) {
-				for (int j = 0; j < documents[0].length; j++) {
-					System.out.print("(" + documents[0][j] + ", " + documents[1][j]
-							+ ", F" + documents[2][j] + ") ");
-				}
-				System.out.println();				
-			} else {
-				for (int j = 0; j < documents[0].length; j++) {
-					System.out.print("(" + documents[0][j] + ", " 
-										 + documents[1][j] + ") ");
-				}
-				System.out.println();
-			}
-		}
-	}
-
-	public int[][] getDocuments(LexiconEntry lEntry) {
-		if (lEntry==null)
-			return null;
-		return getDocuments(lEntry.startOffset, 
-			lEntry.startBitOffset, 
-			lEntry.endOffset, 
-			lEntry.endBitOffset, lEntry.n_t);
-	}
-	/**
-	 * Returns a two dimensional array containing the document ids, term
-	 * frequencies and field scores for the given documents. 	  
-	 * @return int[][] the two dimensional [3][n] array containing the n 
-	 *		 document identifiers, frequencies and field scores. If fields is not enabled, then size is [2][n].
-	 * @param termid the identifier of the term whose documents we are looking for.
-	 */
-	public int[][] getDocuments(int termid) {
-		 LexiconEntry lEntry = lexicon.getLexiconEntry(termid);
-		if (lEntry == null)
-			return null;
-		return getDocuments(lEntry.startOffset,
-			lEntry.startBitOffset,
-			lEntry.endOffset,
-			lEntry.endBitOffset, lEntry.n_t);
+	public void print()
+	{
+		//TODO
+		throw new UnsupportedOperationException("InvIndex.print() is missing");
 	}
 	
-/**
-	 * Returns a two dimensional array containing the document ids, term
-	 * frequencies and field scores for the given documents. 	  
-	 * @return int[][] the two dimensional [3][n] array containing the n 
-	 *		 document identifiers, frequencies and field scores. If fields is not enabled, then size is [2][n].
-	 * @param sOffset start byte of the postings in the inverted file
-	 * @param sBitOffset start bit of the postings in the inverted file
-	 * @param eOffset end byte of the postings in the inverted file
-	 * @param eBitOffset end bit of the postings in the inverted file
-	 */
-	
-	public int[][] getDocuments(long sOffset, byte sBitOffset, long eOffset, byte eBitOffset, int df) {
 		
-		final byte startBitOffset = sBitOffset;
-		final long startOffset = sOffset;
-		final byte endBitOffset = eBitOffset;
-		final long endOffset = eOffset;
+	public int[][] getDocuments(BitIndexPointer pointer) {
+		if (pointer==null)
+			return null;
 		final int fieldCount = FieldScore.FIELDS_COUNT;
 		final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION;
-		//int df = lexicon.getNt();
+		final int count = pointer.getNumberOfEntries();
 		try{
+			final BitIn file = this.file.readReset(pointer.getBytes(), pointer.getBits());
 			int[][] documentTerms = null;
-			final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);		
 			if (loadTagInformation) { //if there are tag information to process			
-				documentTerms = new int[3][df];
+				documentTerms = new int[3][count];
 				documentTerms[0][0] = file.readGamma() - 1;
 				documentTerms[1][0] = file.readUnary();
 				documentTerms[2][0] = file.readBinary(fieldCount);
-				for (int i = 1; i < df; i++) {					
+				for (int i = 1; i < count; i++) {					
 					documentTerms[0][i]  = file.readGamma() + documentTerms[0][i - 1];
 					documentTerms[1][i]  = file.readUnary();
 					documentTerms[2][i]  = file.readBinary(fieldCount);
 				}				
 			} else { //no tag information to process					
-				documentTerms = new int[2][df];
+				documentTerms = new int[2][count];
 				//new		
 				documentTerms[0][0] = file.readGamma() - 1;
 				documentTerms[1][0] = file.readUnary();
-				for(int i = 1; i < df; i++){							 
+				for(int i = 1; i < count; i++){							 
 					documentTerms[0][i] = file.readGamma() + documentTerms[0][i - 1];
 					documentTerms[1][i] = file.readUnary();
 				}
 			}
+			file.close();
 			return documentTerms;
 		} catch (IOException ioe) {
 			logger.error("Problem reading inverted index", ioe);
 			return null;
 		}
-	}
 	
-	
-//	public int[][] getDocuments(long sOffset, byte sBitOffset, long eOffset, byte eBitOffset) {
-//	
-//		final byte startBitOffset = sBitOffset;
-//		final long startOffset = sOffset;
-//		final byte endBitOffset = eBitOffset;
-//		final long endOffset = eOffset;
-//		final int fieldCount = FieldScore.FIELDS_COUNT;
-//		final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION;
-//	
-//		/* Coding is done separately for with Fields and without Fields, to keep
-//		 * if's out of loops. */	
-//		
-//		ArrayList temporaryTerms = null; //instantiate when we know roughly how big it should be
-//		int[][] documentTerms = null;
-//		file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);
-//		//boolean hasMore = false;
-//		if (loadTagInformation) { //if there are tag information to process
-//			/* FIELD_LOAD_FACTOR provides a heuristical rough size need for the arraylist. */
-//			/* could probably do a better optimisation by considering the number of fields.*/
-//			//temporaryTerms = new ArrayList((int)((endOffset-startOffset)*FIELD_LOAD_FACTOR));
-//			TIntArrayList temporaryDocids = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-//			TIntArrayList temporaryTFs = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-//			TIntArrayList temporaryFields = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-//			int previousDocid = -1;
-//			
-//			while (((file.getByteOffset() + startOffset) < endOffset)
-//					|| (((file.getByteOffset() + startOffset) == endOffset) && (file
-//							.getBitOffset() < endBitOffset))) {
-//				//read document ID
-//				temporaryDocids.add(previousDocid = file.readGamma() + previousDocid);
-//				//read document frequency
-//				temporaryTFs.add(file.readUnary());
-//				//read fields bitset (fieldCount bits long)
-//				temporaryFields.add(file.readBinary(fieldCount));
-//		
-//				/*int[] tmp = new int[3];
-//				//read documnent ID
-//				tmp[0] = file.readGamma();
-//				//read document frequency
-//				tmp[1] = file.readUnary();
-//				//read fields bitset (fieldCount bits long) 
-//				tmp[2] = file.readBinary(fieldCount);
-//				temporaryTerms.add(tmp);*/
-//			}
-//			final int postingsListSize = temporaryDocids.size();
-//			documentTerms = new int[3][postingsListSize];
-//			temporaryDocids.toNativeArray(documentTerms[0], 0, postingsListSize);
-//			temporaryTFs.toNativeArray(documentTerms[1], 0, postingsListSize);
-//			temporaryFields.toNativeArray(documentTerms[2], 0, postingsListSize);	
-//			/*
-//			documentTerms = new int[3][temporaryTerms.size()];
-//			int[] tmpDocumentTerms0 = documentTerms[0];
-//			int[] tmpDocumentTerms1 = documentTerms[1];
-//			int[] tmpDocumentTerms2 = documentTerms[2];
-//			tmpDocumentTerms0[0] = ((int[]) temporaryTerms.get(0))[0] - 1;
-//			tmpDocumentTerms1[0] = ((int[]) temporaryTerms.get(0))[1];
-//			tmpDocumentTerms2[0] = ((int[]) temporaryTerms.get(0))[2];
-//			if (documentTerms[0].length > 1) {
-//				for (int i = 1; i < documentTerms[0].length; i++) {
-//					int[] tmpMatrix = (int[]) temporaryTerms.get(i);
-//					tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1];
-//					tmpDocumentTerms1[i] = tmpMatrix[1];
-//					tmpDocumentTerms2[i] = tmpMatrix[2];
-//				}
-//			}
-//			*/		
-//		} else { //no tag information to process
-//			
-//			/* NORMAL_LOAD_FACTOR provides a heuristical rough size need for the arraylist */
-//			TIntArrayList temporaryDocids = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-//			TIntArrayList temporaryTFs = new TIntArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-//			//temporaryTerms = new ArrayList((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-//
-//			int previousDocid = -1;
-//			while (((file.getByteOffset() + startOffset) < endOffset)
-//					|| (((file.getByteOffset() + startOffset) == endOffset) && (file
-//							.getBitOffset() < endBitOffset))) {
-//				//read document ID
-//				temporaryDocids.add(previousDocid = file.readGamma() + previousDocid);
-//				//read document frequency
-//				temporaryTFs.add(file.readUnary());
-//				//int[] tmp = new int[2];
-//				//read document ID
-//				//tmp[0] = file.readGamma();
-//				//read document frequency
-//				//tmp[1] = file.readUnary();
-//				//temporaryTerms.add(tmp);
-//			}
-//
-//			final int postingsListSize = temporaryDocids.size(); /*temporaryTerms.size()*/
-//			documentTerms = new int[2][postingsListSize];
-//			temporaryDocids.toNativeArray(documentTerms[0], 0, postingsListSize);
-//			temporaryTFs.toNativeArray(documentTerms[1], 0, postingsListSize);
-//			//int last = -1;
-//			//int[] tmpDocumentTerms0 = documentTerms[0];
-//			//for(int i=0;i<postingsListSize;i++)
-//			//{
-//			//	last = tmpDocumentTerms0[i] = tmpDocumentTerms0[i] + last;
-//			//}
-//
-//			//int[] tmpDocumentTerms0 = documentTerms[0];
-//			//int[] tmpDocumentTerms1 = documentTerms[1];
-//			//tmpDocumentTerms0[0] = temporaryDocids.get(0);//((int[]) temporaryTerms.get(0))[0] - 1;
-//			//tmpDocumentTerms1[0] = temporaryTFs.get(0); //((int[]) temporaryTerms.get(0))[1];
-//			//if (documentTerms[0].length > 1) {
-//			//	for (int i = 1; i < documentTerms[0].length; i++) {
-//			//		last = tmpDocumentTerms0[i] = temporaryDocids.get(i) + last;
-//			//		tmpDocumentTerms1[i] = temporaryTFs.get(i);
-//					//int[] tmpMatrix = (int[]) temporaryTerms.get(i);
-//					//tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1];
-//					//tmpDocumentTerms1[i] = tmpMatrix[1];
-//			//	}
-//			//}
-//		}
-//		//System.out.println((endOffset-startOffset)+" , "+temporaryTerms.size());
-//		return documentTerms;
-//	}
-	/* *
-	 * Returns a five dimensional array containing the document ids, 
-	 * the term frequencies, the field scores the block frequencies and 
-	 * the block ids for the given documents. The returned postings are
-	 * for the documents within a specified range of docids.
-	 * @return int[][] the five dimensional [5][] array containing 
-	 *		 the document ids, frequencies, field scores and block 
-	 *		 frequencies, while the last vector contains the 
-	 *		 block identifiers and it has a different length from 
-	 *		 the document identifiers.
-	 * @param termid the id of the term whose documents we are looking for.
-	 * @param startDocid The starting docid that will be returned.
-	 * @param endDocid The last possible docid that will be returned.
-	 */
-	/*public int[][] getDocuments(int termid, int startDocid, int endDocid) {
-		// Coding is done separately for with Fields and without Fields, to keep
-		  if's out of loops. 
-		boolean found = lexicon.findTerm(termid);
-		if (!found) 
-			return null;
-		
-		byte startBitOffset = lexicon.getStartBitOffset();
-		long startOffset = lexicon.getStartOffset();
-		byte endBitOffset = lexicon.getEndBitOffset();
-		long endOffset = lexicon.getEndOffset();
-		final int fieldCount = FieldScore.FIELDS_COUNT;
-		final boolean loadTagInformation = FieldScore.USE_FIELD_INFORMATION;
-		
-		ArrayList<int[]> temporaryTerms = null; //instantiate when we know roughly how big it should be
-		int[][] documentTerms = null;
-		try{
-			final BitIn file = this.file.readReset(startOffset, startBitOffset, endOffset, endBitOffset);
-			//boolean hasMore = false;
-			if (loadTagInformation) { //if there are tag information to process
-				// FIELD_LOAD_FACTOR provides a heuristical rough size need for the arraylist. 
-				// could probably do a better optimisation by considering the number of fields.
-				temporaryTerms = new ArrayList<int[]>((int)((endOffset-startOffset)*FIELD_LOAD_FACTOR));
-				while (((file.getByteOffset() + startOffset) < endOffset)
-						|| (((file.getByteOffset() + startOffset) == endOffset) && (file
-								.getBitOffset() < endBitOffset))) {
-					int[] tmp = new int[3];
-					//read documnent ID
-					tmp[0] = file.readGamma();
-					//read document frequency
-					tmp[1] = file.readUnary();
-					//read fields bitset (fieldCount bits long) 
-					tmp[2] = file.readBinary(fieldCount);
-					if (tmp[0]>=startDocid && tmp[0]<=endDocid)
-						temporaryTerms.add(tmp);
-				}
-				documentTerms = new int[3][temporaryTerms.size()];
-				int[] tmpDocumentTerms0 = documentTerms[0];
-				int[] tmpDocumentTerms1 = documentTerms[1];
-				int[] tmpDocumentTerms2 = documentTerms[2];
-				tmpDocumentTerms0[0] = ((int[]) temporaryTerms.get(0))[0] - 1;
-				tmpDocumentTerms1[0] = ((int[]) temporaryTerms.get(0))[1];
-				tmpDocumentTerms2[0] = ((int[]) temporaryTerms.get(0))[2];
-				if (documentTerms[0].length > 1) {
-					for (int i = 1; i < documentTerms[0].length; i++) {
-						int[] tmpMatrix = (int[]) temporaryTerms.get(i);
-						tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1];
-						tmpDocumentTerms1[i] = tmpMatrix[1];
-						tmpDocumentTerms2[i] = tmpMatrix[2];
-					}
-				}			
-			} else { //no tag information to process
-				
-				//NORMAL_LOAD_FACTOR provides a heuristical rough size need for the arraylist 
-				temporaryTerms = new ArrayList<int[]>((int)((endOffset-startOffset)*NORMAL_LOAD_FACTOR));
-				while (((file.getByteOffset() + startOffset) < endOffset)
-						|| (((file.getByteOffset() + startOffset) == endOffset) && (file
-								.getBitOffset() < endBitOffset))) {
-					int[] tmp = new int[2];
-					//read document ID
-					tmp[0] = file.readGamma();
-					//read document frequency
-					tmp[1] = file.readUnary();
-					temporaryTerms.add(tmp);
-				}
-				documentTerms = new int[2][temporaryTerms.size()];
-				int[] tmpDocumentTerms0 = documentTerms[0];
-				int[] tmpDocumentTerms1 = documentTerms[1];
-				tmpDocumentTerms0[0] = ((int[]) temporaryTerms.get(0))[0] - 1;
-				tmpDocumentTerms1[0] = ((int[]) temporaryTerms.get(0))[1];
-				if (documentTerms[0].length > 1) {
-					for (int i = 1; i < documentTerms[0].length; i++) {
-						int[] tmpMatrix = (int[]) temporaryTerms.get(i);
-						tmpDocumentTerms0[i] = tmpMatrix[0] + documentTerms[0][i - 1];
-						tmpDocumentTerms1[i] = tmpMatrix[1];
-					}
-				}			
-			}
-		}
-		catch (IOException ioe) {
-			logger.error("Problem reading inverted index", ioe);
-			return null;
-		}
-		
-		return documentTerms;
-	}*/
-	
-	/**
-	 * Returns the information for a posting list in string format 
-	 */
-	public String getInfo(int term) {
-			StringBuilder info = new StringBuilder();					
-			int[][] documents = getDocuments(term);			
-			if (useFieldInformation) {
-				for (int j = 0; j < documents[0].length; j++) {
-					info.append("(");
-					info.append(documents[0][j]);
-					info.append(","); 
-					info.append(documents[1][j]);
-					info.append(",");
-					info.append(documents[2][j]);
-					info.append(")");
-				}							
-			} else {
-				for (int j = 0; j < documents[0].length; j++) {
-					info.append("(");
-					info.append(documents[0][j]);
-					info.append(",");
-					info.append(documents[1][j]);
-					info.append(")");
-				}				
-			}
-			return info.toString();
 		}
 	
 	
Index: src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java,v
retrieving revision 1.3
diff -w -u -r1.3 InvertedIndexInputStream.java
--- src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java	28 Jan 2009 20:16:55 -0000	1.3
+++ src/uk/ac/gla/terrier/structures/InvertedIndexInputStream.java	26 Feb 2009 16:11:47 -0000
@@ -27,11 +27,12 @@
 package uk.ac.gla.terrier.structures;
 
 import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
 
 import uk.ac.gla.terrier.compression.BitIn;
 import uk.ac.gla.terrier.compression.BitInputStream;
 import uk.ac.gla.terrier.compression.OldBitInputStream;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
 
 
@@ -43,7 +44,7 @@
 public class InvertedIndexInputStream implements Closeable,LegacyBitFileStructure
 {
 	/** the lexicon input stream providing the offsets */
-	protected final LexiconInputStream lis;
+	protected final Iterator<Map.Entry<?,? extends BitIndexPointer>> lis;
 	/** The gamma compressed file containing the terms. */
 	protected BitIn file; 
 	/** filename of the underlying bitfile */
@@ -52,21 +53,10 @@
 	/** Indicates whether field information is used.*/
 	final boolean useFieldInformation = FieldScore.USE_FIELD_INFORMATION;
 	
-	public InvertedIndexInputStream(String path, String prefix, LexiconInputStream lis) throws IOException
+	public InvertedIndexInputStream(Index _index, String structureName, Iterator<Map.Entry<?, ? extends BitIndexPointer>> positions) throws IOException
 	{
-		this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.IFSUFFIX, lis);
-	}
-	
-	public InvertedIndexInputStream(String filename, LexiconInputStream lis) throws IOException
-	{
-		file = new BitInputStream(this.filename = filename);
-		this.lis = lis;
-	}
-
-	public InvertedIndexInputStream(BitIn invFile, LexiconInputStream lis) throws IOException
-	{
-		file = invFile;
-		this.lis = lis;
+		file = new BitInputStream(_index.getPath() + "/" + _index.getPrefix() +"."+ structureName +".bf");
+		this.lis = positions;
 	}
 
 	/** forces the data structure to reopen the underlying bitfile
@@ -80,15 +70,17 @@
 	}
 	
 	public int[][] getNextDocuments() throws IOException {
-		int rtrLis = lis.readNextEntry();
-		if (rtrLis < 0)
+		if (! lis.hasNext())
 			return null;
-		return getNextDocuments(lis.getNt(), lis.getEndOffset(), lis.getEndBitOffset());
+		
+		return getNextDocuments(lis.next().getValue());
 	}
 	
-	protected int[][] getNextDocuments(int df, long endByteOffset, byte endBitOffset) throws IOException {
+	protected int[][] getNextDocuments(BitIndexPointer pointer) throws IOException {
 		int[][] documentTerms = null;
 		final int fieldCount = FieldScore.FIELDS_COUNT;
+		System.out.println("term"+ ((TermStatistics)pointer).getTermId() + " has Nt="+pointer.getNumberOfEntries() );
+		final int df = pointer.getNumberOfEntries();
 		if (useFieldInformation) { //if there are tag information to process			
 			documentTerms = new int[3][df];
 			documentTerms[0][0] = file.readGamma() - 1;
@@ -118,7 +110,7 @@
 		try{
 		while((documents = getNextDocuments()) != null)
 		{
-			System.out.print("tid"+i);
+			System.out.print((i++)+"th term: ");
 			if (useFieldInformation) {
 				for (int j = 0; j < documents[0].length; j++) {
 					System.out.print("(" + documents[0][j] + ", " + documents[1][j]
@@ -140,6 +132,7 @@
 	public void close()
 	{
 		try{ file.close(); } catch (IOException ioe) {}
-		lis.close();
+		if (lis instanceof Closeable)
+			((Closeable)lis).close();
 	}
 }
Index: src/uk/ac/gla/terrier/structures/Lexicon.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/Lexicon.java,v
retrieving revision 1.47
diff -w -u -r1.47 Lexicon.java
--- src/uk/ac/gla/terrier/structures/Lexicon.java	28 Jan 2009 20:16:55 -0000	1.47
+++ src/uk/ac/gla/terrier/structures/Lexicon.java	26 Feb 2009 16:11:47 -0000
@@ -1,654 +1,61 @@
-/*
- * Terrier - Terabyte Retriever 
- * Webpage: http://ir.dcs.gla.ac.uk/terrier 
- * Contact: terrier{a.}dcs.gla.ac.uk
- * University of Glasgow - Department of Computing Science
- * http://www.gla.ac.uk/
- * 
- * The contents of this file are subject to the Mozilla Public License
- * Version 1.1 (the "License"); you may not use this file except in
- * compliance with the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS"
- * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
- * the License for the specific language governing rights and limitations
- * under the License.
- *
- * The Original Code is Lexicon.java.
- *
- * The Original Code is Copyright (C) 2004-2009 the University of Glasgow.
- * All Rights Reserved.
- *
- * Contributor(s):
- *   Gianni Amati <gba{a.}fub.it> (original author)
- *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk> 
- */
 package uk.ac.gla.terrier.structures;
-import gnu.trove.TIntObjectHashMap;
 
-import java.io.ByteArrayInputStream;
-import java.io.DataInputStream;
-import java.io.File;
-import java.io.IOException;
-import java.io.ObjectInputStream;
-import java.util.Iterator;
+import java.util.Map;
 
-import uk.ac.gla.terrier.utility.ApplicationSetup;
-import uk.ac.gla.terrier.utility.Files;
-import uk.ac.gla.terrier.utility.io.RandomDataInput;
-import uk.ac.gla.terrier.utility.io.RandomDataOutput;
-
-import org.apache.log4j.Logger;
-/**
- * The class that implements the lexicon structure. Apart from the lexicon file,
- * which contains the actual data about the terms, and takes its name from
- * ApplicationSetup.LEXICON_FILENAME, another file is created and
- * used, containing a mapping from the term's code to the offset of the term 
- * in the lexicon. The name of this file is given by 
- * ApplicationSetup.LEXICON_INDEX_FILENAME.
- * 
- * @see ApplicationSetup#LEXICON_FILENAME
- * @see ApplicationSetup#LEXICON_INDEX_FILENAME
- * @author Gianni Amati, Vassilis Plachouras
- * @version $Revision: 1.47 $
- */
-public class Lexicon implements Iterable<String>, Closeable{
-	/** The logger used for the Lexicon */
-	protected Logger logger = Logger.getRootLogger();
-	
-	/** The term represented as an array of bytes.*/
-	protected byte[] termCharacters;
-	
-	/** The term represented as a string.*/
-	protected String term;
-	
-	/** An integer representing the id of the term.*/
-	protected int termId;
-	
-	/** The document frequency of the term.*/
-	protected int documentFrequency;
-	
-	/** The term frequency of the term.*/
-	protected int termFrequency;
-	
-	/** The offset in bytes in the inverted file of the term.*/
-	protected long startOffset;
-	
-	/** The offset in bits in the starting byte in the inverted file.*/
-	protected byte startBitOffset;
-	
-	/** The offset in bytes in the inverted file of the term.*/
-	protected long endOffset;
-	
-	/** The offset in bits in the ending byte in the inverted file.*/
-	protected byte endBitOffset;
-	
-	/** 
-	 * The size in bytes of an entry in the lexicon file.
-	 * An entry corresponds to a string, an int (termCode), 
-	 * an int (docf), an int (tf), a long (the offset of the end 
-	 * of the term's entry in bytes in the inverted file) and
-	 * a byte (the offset in bits of the last byte of the term's entry 
-	 * in the inverted file.
-	 */
-	public static final int lexiconEntryLength =
-		ApplicationSetup.STRING_BYTE_LENGTH //the string representation
-		+12 //the three integers
-		+8 //the long
-		+1; //the byte
-	
-	/** The file containing the mapping from the codes to the offset in the lexicon file.*/
-	protected RandomDataInput idToOffsetFile;
-	
-	/** The actual lexicon file.*/
-	protected RandomDataInput lexiconFile;
-
-	/** Filename of the of lexicon file opened */
-	protected String lexiconFileName;
-	
-	/** The number of entries in the lexicon file.*/
-	protected int numberOfLexiconEntries;
-	
-	/** A buffer for reading from the lexicon file.*/
-	protected byte[] buffer = new byte[512];
-	
-	/** A second buffer for finding terms.*/
-	protected byte[] bt = new byte[ApplicationSetup.STRING_BYTE_LENGTH];
-	
-	/** A byte input stream to read from the buffer.*/
-	protected ByteArrayInputStream bufferInput = new ByteArrayInputStream(buffer);
-	
-	/** A data input stream to read from the bufferInput.*/
-	protected DataInputStream dataInput = new DataInputStream(bufferInput);
-	
-	/** 
-	 * A hashmap that is used in order to reduce the number 
-	 * of random accesses on disk during the binary search
-	 */
-	protected TIntObjectHashMap map = null;
-
-	/** Controls whether to use the hash for speeding up 
-	 * lexicon entry lookups or not. The corresponding
-	 * property is <tt>lexicon.use.hash</tt>.
-	 */
-	protected boolean USE_HASH = Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"));
-
-	protected Class inputStreamClass = LexiconInputStream.class;
-	
-	/** Contructor for child classes which dont want to open a file */
-	protected Lexicon(long a, long b, long c) {}
-	
-	/** 
-	 * A default constructor.
-	 */
-	public Lexicon() {
-		this(ApplicationSetup.LEXICON_FILENAME);
-	}
-
-	public Lexicon(String path, String prefix)
+public abstract class Lexicon<KEY> implements Closeable, Iterable<Map.Entry<KEY,LexiconEntry>>
 	{
-		this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX);
-	}
-	
-	/**
-	 * Constructs an instace of Lexicon and opens
-	 * the corresponding file.
-	 * 
-	 * @param lexiconName the name of the lexicon file.
-	 */
-	public Lexicon(String lexiconName) {
-		boolean updateable = false;
-		try {
-			lexiconFile = updateable
-				? Files.writeFileRandom(this.lexiconFileName = lexiconName)
-				: Files.openFileRandom(this.lexiconFileName = lexiconName);
-			idToOffsetFile = Files.openFileRandom(lexiconName.substring(0,lexiconName.lastIndexOf(".")).concat(ApplicationSetup.LEXICON_INDEX_SUFFIX));
-			numberOfLexiconEntries = (int) (lexiconFile.length() / (long)lexiconEntryLength);
-			
-			if (USE_HASH) {
-				try{
-					String hashFilename = lexiconName.substring(0,lexiconName.lastIndexOf(".")).concat(ApplicationSetup.LEXICON_HASH_SUFFIX);
-					ObjectInputStream ois = new ObjectInputStream(Files.openFileStream(hashFilename));
-					map = (TIntObjectHashMap)ois.readObject();
-					ois.close();
-				}
-				catch (IOException ioe) {
-					logger.warn("Input/output exception while reading the hashmap used for the lexicon. Hash will not be used." + ioe);
-					USE_HASH = false;
-				} catch (ClassNotFoundException cnfe) {
-					logger.warn("ClassNotFoundException while reading the hashmap used for the lexicon. Hash will not be used." + cnfe);
-					USE_HASH = false;
-				}
-			}//USE_HASH
-		} catch (IOException ioe) {
-			logger.error("Input/output exception while opening for reading the lexicon file: " + ioe);
-		}
-
-	}
-	
-	/**
-	* Closes the lexicon and lexicon index files.
-	*/
-	public void close() {
-		try {
-			idToOffsetFile.close();
-			lexiconFile.close();
-		} catch (IOException ioe) {
-			logger.error("Input/output exception while closing the lexicon file: " + ioe);
-		}
-	}
-	
-	/** 
-	 * Prints out the contents of the lexicon file. 
-	 * Streams are used to read the lexicon file.
-	 */
-	public void print() {
-		LexiconInputStream tmp=null;
-		try{
-			tmp = (LexiconInputStream)inputStreamClass.getConstructor(String.class).newInstance(this.lexiconFileName);
-		} catch (Exception e) {logger.error(e); return;}
-		final LexiconInputStream _lis=tmp;
-		_lis.print();
-	}
-
-	/**
-	 * Finds the term given its term code.
-	 *
-	 * @return true if the term is found, else return false
-	 * @param _termId the term's identifier
-	 */
-	public boolean findTerm(int _termId) {
-		try {
-			idToOffsetFile.seek((long)_termId * 8L);
-			return seekEntry((int) (idToOffsetFile.readLong()/(long)lexiconEntryLength));
-		} catch(IOException ioe) {
-			logger.error("Input/Output exception while reading the lexicon index file for termid "+_termId+": ", ioe);
-		}
-		return false;
-		
-	}
-	/** 
-	 * Performs a binary search in the lexicon
-	 * in order to locate the given term.
-	 * If the term is located, the properties
-	 * termCharacters, documentFrequency,
-	 * termFrequency, startOffset, startBitOffset,
-	 * endOffset and endBitOffset contain the
-	 * values related to the term.
-	 * @param _term The term to search for.
-	 * @return true if the term is found, and false otherwise.
-	 */
-	public boolean findTerm(String _term) {
-		int low = -1;
-		int high = numberOfLexiconEntries;
-		int i;
-		int compareStrings;
-
-		if (USE_HASH) {
-			int firstChar = _term.charAt(0);
-			int[] boundaries = (int[])map.get(firstChar);
-			low = boundaries[0];
-			high = boundaries[1];
-
-		}
-
-		//if (logger.isDebugEnabled()) 
-		//	logger.debug("lexicon hash low high for term " + _term + " are: " + low + " " + high);
-		
-		try {
-			while (high-low>1) {
-				
-				i = (high + low)/2;
-				
-				lexiconFile.seek((long)i * (long)lexiconEntryLength);
-				lexiconFile.readFully(buffer, 0, lexiconEntryLength);
-				term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim();
-							
-				if ((compareStrings = _term.compareTo(term))< 0)
-					high = i;
-				else if (compareStrings > 0)
-					low = i;
-				else { 
-					seekEntry(i);
-					return true;
-				}
-					
-			
-			}
-		} catch(IOException ioe) {
-			logger.fatal("IOException while binary searching the lexicon: " + ioe);
-		}
-		
-		if (high == numberOfLexiconEntries)
-			return false;
-		
-		seekEntry(high);
-		if (_term.compareTo(term) == 0) 
-			return true; 
-		return false;
-	}
-
-	/**
-	 * Returns the bit offset in the last byte of 
-	 * the term's entry in the inverted file.
-	 * @deprecated
-	 * @return byte the bit offset in the last byte of 
-	 *		 the term's entry in the inverted file
-	 */
-	public byte getEndBitOffset() {
-		return endBitOffset;
-	}
-	/**
-	 * Returns the ending offset of the term's entry in the inverted file.
-	 * @deprecated
-	 * @return long The ending byte of the term's entry in the inverted file.
-	 */
-	public long getEndOffset() {
-		return endOffset;
-	}
-	/**
-	 * Return the document frequency for the given term.
-	 * @deprecated
-	 * @return int The document frequency for the given term
-	 */
-	public int getNt() {
-		return documentFrequency;
-	}
-	/**
-	 * Returns the number of entries in the lexicon.
-	 * @return the number of entries in the lexicon.
-	 * @deprecated
-	 */
-	public long getNumberOfLexiconEntries() {
-		return numberOfLexiconEntries;
-	}
-	/**
-	 * The bit offset in the starting byte of 
-	 * the entry in the inverted file.
-	 * @deprecated
-	 * @return byte The number of bits in the first 
-	 *		 byte of the entry in the inverted file
-	 */
-	public byte getStartBitOffset() {
-		return startBitOffset;
-	}
-	/**
-	 * Returns the beginning of the term's entry in the inverted file.
-	 * @deprecated
-	 * @return long the start offset (in bytes) in the inverted file
-	 */
-	public long getStartOffset() {
-		return startOffset;
-	}
-	/**
-	 * Insert the method's description here.
-	 * @deprecated
-	 * @return java.lang.String The string representation of the seeked term.
-	 */
-	public String getTerm() {
-		return this.term.trim();
-	}
-	/**
-	 * Returns the term's id.
-	 * @deprecated
-	 * @return int the term's id.
-	 */
-	public int getTermId() {
-		return termId;
-	}
-	/**
-	 * Returns the term frequency for the already seeked term.
-	 *
-	 * @return int The term frequency in the collection.
-	 * @deprecated
-	 */
-	public int getTF() {
-		return termFrequency;
-	}
-	/**
-	 * Seeks the i-th entry of the lexicon.
-	 * TODO read a byte array from the file and decode it, 
-	 * 		instead of reading the different pieces of 
-	 *	  information separately.
-	 * @param i The index of the entry we are looking for.
-	 * @return true if the entry was found, false otherwise.
-	 */
-	public boolean seekEntry(int i) {
-		try {
-			if (i >= numberOfLexiconEntries || i < 0)
-				return false;
-			else {
-				if (i == 0) {
-					lexiconFile.seek(0);
-					startOffset = 0;
-					startBitOffset = 0;
-					lexiconFile.readFully(buffer, 0, lexiconEntryLength);
-					dataInput.reset();
-					term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim();
-				} else {
-					lexiconFile.seek((i-1) * (long)lexiconEntryLength + (long)(ApplicationSetup.STRING_BYTE_LENGTH + 12));
-					lexiconFile.readFully(buffer, 0, lexiconEntryLength + 9);
-					dataInput.reset();
-					startOffset = dataInput.readLong();
-					startBitOffset = dataInput.readByte();
-					if (++startBitOffset == 8) {
-						startBitOffset = 0;
-						startOffset++;
-					}
-					term = new String(buffer, 9, ApplicationSetup.STRING_BYTE_LENGTH).trim();					
-				}
-				dataInput.skipBytes(ApplicationSetup.STRING_BYTE_LENGTH);
-				termId = dataInput.readInt();
-				documentFrequency = dataInput.readInt();
-				termFrequency = dataInput.readInt();
-				endOffset = dataInput.readLong();
-				endBitOffset = dataInput.readByte();
-				return true;
-			}
-		} catch (IOException ioe) {
-			logger.error("Input/Output exception while reading the idToOffset file. ", ioe);
-		}
-		return false;
-	}
-
-	
-	/**
-	 * In an already stored entry in the lexicon
-	 * file, the information about the term frequency,
-	 * the endOffset in bytes, and the endBitOffset in the last
-	 * byte, is updated. The term is specified by the index of the entry.
-	 *
-	 * @return true if the information is updated properly, 
-	 *		 otherwise return false
-	 * @param i the i-th entry
-	 * @param frequency the term's Frequency
-	 * @param endOffset the offset of the ending byte in the inverted file
-	 * @param endBitOffset the offset in bits in the ending byte 
-	 *		in the term's entry in inverted file
-	 * @deprecated The Lexicon class is only used for reading the
-	 *			 lexicon file, and not for writing any information.
-	 */
-	public boolean updateEntry(
-		int i,
-		int frequency,
-		long endOffset,
-		byte endBitOffset) {
-		
-		if (! (lexiconFile instanceof RandomDataOutput))
-			return false;
-		RandomDataOutput _lexiconFile = (RandomDataOutput)lexiconFile;
-		try {
-			long lexiconOffset = (long)i * (long)lexiconEntryLength;
-			//we seek the offset where the frequency should be writen
-			_lexiconFile.seek(
-				lexiconOffset + ApplicationSetup.STRING_BYTE_LENGTH + 8);
-			_lexiconFile.writeInt(frequency);
-			_lexiconFile.writeLong(endOffset);
-			_lexiconFile.writeByte(endBitOffset);
-		} catch (IOException ioe) {
-			logger.error("Input/Output exception while writing to the lexicon file. ", ioe);
-		}
-		return false;
-	}
-
-
-	/** Returns the number of entries in the lexicon file specified by f.
-	  * @param f The file to find the number of entries in
-	  */
-	public static int numberOfEntries(File f) {
-		return (int) ( f.length()/(long)lexiconEntryLength );
-	}
-
-	/** Returns the number of entries in the lexicon file specified by filename.
-	  * @param filename
-	  */
-	public static int numberOfEntries(String filename) {
-		return numberOfEntries(new File(filename));
-	}
-
-	
-	/** Returns a LexiconEntry describing all the information in the lexicon about the ith term 
-	 * in the lexicon.
-	 * @param termNumber The ith term in the lexicon. i is 0-based, and runs to getNumberOfLexiconEntries()-1
-	 * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found
-	 */
-	public LexiconEntry getIthLexiconEntry(int termNumber) {
-		if (! seekEntry(termNumber))
-			return null;
-		LexiconEntry le = new LexiconEntry();
-		le.termId = this.termId;
-		le.term = this.term.trim();
-		le.TF = this.termFrequency;
-		le.n_t = this.documentFrequency;
-		le.startOffset = this.startOffset;
-		le.startBitOffset = this.startBitOffset;
-		le.endOffset = this.endOffset;
-		le.endBitOffset = this.endBitOffset;
-		return le;
-	}
-	
-	/** Returns a LexiconEntry describing all the information in the lexicon about the term
-	  * denoted by termid
-	  * @param termid the termid of the term of interest
-	  * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found */
-	public LexiconEntry getLexiconEntry(int termid) {
-		/* TODO: improve this to the effectiveness level of getLexiconEntry() */
-		if (! findTerm(termid))
-			return null;
-		LexiconEntry le = new LexiconEntry();
-		le.termId = this.termId;
-		le.term = this.term.trim();
-		le.TF = this.termFrequency;
-		le.n_t = this.documentFrequency;
-		le.startOffset = this.startOffset;
-		le.startBitOffset = this.startBitOffset;
-		le.endOffset = this.endOffset;
-		le.endBitOffset = this.endBitOffset;
-		return le;
-	}
-	
-	/** Returns a LexiconEntry describing all the information in the lexicon about the term
-	  * denoted by _term
-	  * @param _term the String term that is of interest
-	  * @return LexiconEntry all information about the term's entry in the lexicon. null if termid not found */
-	public LexiconEntry getLexiconEntry(String _term) {
-		int low = -1;
-		int high = numberOfLexiconEntries;
-		int i;
-		int compareStrings;
-		String term;
-		byte[] buffer = new byte[lexiconEntryLength+9]; //to get the start offsets as well
-		
-		if (USE_HASH) {
-			int firstChar = _term.charAt(0);
-			int[] boundaries = (int[])map.get(firstChar);
-			if (boundaries != null)
+    static class LexiconFileEntry<KEY2> implements Map.Entry<KEY2,LexiconEntry>
 			{
-				low = boundaries[0];
-				high = boundaries[1];
-			}
-			//System.out.println("lexicon use hash: " + low + " " + high);
-		}
-		
-		try {
-			while (high-low>1) {
+        KEY2 key;
+        LexiconEntry value;
 				
-				i = (high + low)/2;
-				if (i==0) {
-					lexiconFile.seek(0);
-					lexiconFile.readFully(buffer, 0, lexiconEntryLength);
-					term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim();
-				} else {
-					lexiconFile.seek((long)i * (long)(lexiconEntryLength)-9L);
-					lexiconFile.readFully(buffer, 0, lexiconEntryLength+9);
-					term = new String(buffer,9,ApplicationSetup.STRING_BYTE_LENGTH).trim();
-				}
-							
-				if ((compareStrings = _term.compareTo(term))< 0)
-					high = i;
-				else if (compareStrings > 0)
-					low = i;
-				else { //read the rest and return the data
-					return getLexiconEntryFromBuffer(buffer, term, i);
-				}
-			}
-		
-			if (high == numberOfLexiconEntries)
-				return null;
-			
-			if (high == 0) {
-				lexiconFile.seek(0);
-				lexiconFile.readFully(buffer, 0, lexiconEntryLength);
-				term = new String(buffer,0,ApplicationSetup.STRING_BYTE_LENGTH).trim();
-			} else {
-				lexiconFile.seek((long)high * (long)(lexiconEntryLength)-9L);
-				lexiconFile.readFully(buffer, 0, lexiconEntryLength+9);
-				term = new String(buffer,9,ApplicationSetup.STRING_BYTE_LENGTH).trim();				
+        public LexiconFileEntry(KEY2 k, LexiconEntry v)
+        {
+            this.key = k;
+            this.value = v;
 			}
 			
-			if (_term.compareTo(term) == 0) {
-				return getLexiconEntryFromBuffer(buffer, term, high);
-			}	
-		} catch(IOException ioe) {
-			logger.fatal("IOException while binary searching the lexicon: " + ioe);
-		}
-		return null;
+        public int hashCode()
+        {
+            LexiconFileEntry e = this;
+            return (e.getKey()==null   ? 0 : e.getKey().hashCode()) ^
+             (e.getValue()==null ? 0 : e.getValue().hashCode());
 	}
 
-	protected LexiconEntry getLexiconEntryFromBuffer(byte[] buffer, String term, int index) {
-		int offset;
-		LexiconEntry lEntry = new LexiconEntry();
-		lEntry.term = term;
-		if (index==0) {
-			lEntry.startOffset = 0;
-			lEntry.startBitOffset = 0;
-			offset = ApplicationSetup.STRING_BYTE_LENGTH;						
-		} else {
-			offset = 0;
-//			lEntry.startOffset =
-//				(((((((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 |
-//					   buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff;
-
-			long startOffset = (buffer[offset++] & 0xff);
-			for (int j=0; j<7; j++)
-				startOffset = startOffset<<8 | (buffer[offset++] & 0xff);
-			lEntry.startOffset = startOffset;
-
-			
-			lEntry.startBitOffset = (byte)(buffer[offset++]&0xff);
-			if (++lEntry.startBitOffset == 8) {
-				lEntry.startBitOffset = 0;
-				lEntry.startOffset++;
+        public LexiconEntry setValue(LexiconEntry v)
+        {
+        	LexiconEntry old = value;
+            value = v;
+            return old;
 			}
 
-			offset += ApplicationSetup.STRING_BYTE_LENGTH;
+        public KEY2 getKey()
+        {
+            return key;
 		}
-		lEntry.termId = 
-			(((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff;
-		lEntry.n_t =
-			(((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff;
-		lEntry.TF =
-			(((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff;
-		
-//		lEntry.endOffset = 
-//			(((((((buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 |
-//				   buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff) << 8 | buffer[offset++]&0xff;
 
-		long endOffset = (int)(buffer[offset++] & 0xff);
-		for (int j=0; j<7; j++)
-			endOffset = endOffset<<8 | (buffer[offset++] & 0xff);
-		lEntry.endOffset = endOffset;
-		
-		lEntry.endBitOffset = (byte)(buffer[offset]&0xff);
-		return lEntry;
+        public LexiconEntry getValue()
+        {
+            return value;    
 	}
 
-	/** Returns an interator that gives every item in the lexicon, in lexical order. Underlying implementation is
-	  * using a lexicon input stream */
-	public Iterator<String> iterator()
+        @SuppressWarnings("unchecked")
+		public boolean equals(Object o)
 	{
-		LexiconInputStream tmp=null;
-		try{
-			tmp = (LexiconInputStream)inputStreamClass.getConstructor(String.class).newInstance(this.lexiconFileName);
-		} catch (Exception e) {logger.error(e);}
-		final LexiconInputStream _lis=tmp;
-		return new Iterator<String>(){
-			LexiconInputStream lis = _lis;
-			 public boolean hasNext(){
-				try{
-					return lis.readNextEntry() != -1;
-				} catch (IOException ioe) {
-					logger.error(ioe);
+            if (! (o instanceof Map.Entry))
 					return false;
-				}
-			}
-			public String next()
-			{
-				return lis.getTerm();
-			}
-			public void remove() { throw new UnsupportedOperationException();}
-		};
+            LexiconFileEntry e1 = this;
+            Map.Entry<String,LexiconEntry> e2 = (Map.Entry)o;
+            return (e1.getKey()==null ?
+              e2.getKey()==null : e1.getKey().equals(e2.getKey()))  &&
+             (e1.getValue()==null ?
+              e2.getValue()==null : e1.getValue().equals(e2.getValue()));
 	}
 }
 
+    public abstract int numberOfEntries();
+    public abstract LexiconEntry getLexiconEntry(KEY term);
+    public abstract Map.Entry<KEY,LexiconEntry> getLexiconEntry(int termid);
+    public abstract Map.Entry<KEY,LexiconEntry> getIthLexiconEntry(int index);
+    public abstract void close();
+}
\ No newline at end of file
Index: src/uk/ac/gla/terrier/structures/LexiconEntry.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/LexiconEntry.java,v
retrieving revision 1.5
diff -w -u -r1.5 LexiconEntry.java
--- src/uk/ac/gla/terrier/structures/LexiconEntry.java	28 Jan 2009 20:16:55 -0000	1.5
+++ src/uk/ac/gla/terrier/structures/LexiconEntry.java	26 Feb 2009 16:11:47 -0000
@@ -1,86 +1,15 @@
-
-/*
- * Terrier - Terabyte Retriever 
- * Webpage: http://ir.dcs.gla.ac.uk/terrier 
- * Contact: terrier{a.}dcs.gla.ac.uk
- * University of Glasgow - Department of Computing Science
- * http://www.gla.ac.uk/
- * 
- * The contents of this file are subject to the Mozilla Public License
- * Version 1.1 (the "License"); you may not use this file except in
- * compliance with the License. You may obtain a copy of the License at
- * http://www.mozilla.org/MPL/
- *
- * Software distributed under the License is distributed on an "AS IS"
- * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
- * the License for the specific language governing rights and limitations
- * under the License.
- *
- * The Original Code is BlockDirectIndex.java.
- *
- * The Original Code is Copyright (C) 2004-2009 the University of Glasgow.
- * All Rights Reserved.
- *
- * Contributor(s):
- *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk> (original author)
- *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
- */
 package uk.ac.gla.terrier.structures;
+import org.apache.hadoop.io.Writable;
 
-/** Contains all the information about one entry in the Lexicon. 
-  * Created to make thread-safe lookups in the Lexicon easier. */
-public class LexiconEntry {
-
-	/** Create an empty LexiconEntry */
-	public LexiconEntry(){}
-
-	/** Create a lexicon entry with the following information.
-	  * @param t the term 
-	  * @param tid the term id
-	  * @param n_t the number of documents the term occurs in (document frequency)
-	  * @param TF the total count of therm t in the collection
-	  */
-	public LexiconEntry(String t, int tid, int n_t, int TF)
+public abstract class LexiconEntry implements TermStatistics, BitIndexPointer, Writable
 	{
-		this.term =t;
-		this.termId = tid;
-		this.n_t = n_t;
-		this.TF = TF;
-	}
 
-	/** increment this lexicon entry by another */
-	public void add(LexiconEntry le)
+    public String toString()
 	{
-		this.n_t += le.n_t;
-		this.TF  += le.TF;
+        return "(" + getDocumentFrequency() + "," + getFrequency() + ")" // "(" must be a String: '(' + number is numeric addition, not concatenation
+            + "@" + getBytes() + "," + getBits();
 	}
 
-	/** alter this lexicon entry to subtract another lexicon entry */
-	public void subtract(LexiconEntry le)
-	{
-		this.n_t -= le.n_t;
-		this.TF  -= le.TF;
-	}
-
-	/** the term of this entry */	
-	public String term;
-	/** the termid of this entry */
-	public int termId;
-	/** the number of document that this entry occurs in */
-	public int n_t;
-	/** the total number of occurrences of the term in the index */
-	public int TF;
-	/** the start offset of the entry in the inverted index */
-	public long startOffset;
-	/** the start bit offset of the entry in the inverted index */
-	public byte startBitOffset;
-	/** the end offset of the entry in the inverted index */
-	public long endOffset;
-	/** the end bit offset of the entry in the inverted index */
-	public byte endBitOffset;
-
-	/** returns a string representation of this lexicon entry */	
-	public String toString() {
-		return term + " " + termId + " " + n_t + " " + TF + " " + startOffset + " " + startBitOffset + " " + endOffset + " " + endBitOffset;
-	}
+    public abstract void add(LexiconEntry le);
+    public abstract void setTermId(int newTermId);
 }
Index: src/uk/ac/gla/terrier/structures/LexiconOutputStream.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/LexiconOutputStream.java,v
retrieving revision 1.29
diff -w -u -r1.29 LexiconOutputStream.java
--- src/uk/ac/gla/terrier/structures/LexiconOutputStream.java	28 Jan 2009 20:16:55 -0000	1.29
+++ src/uk/ac/gla/terrier/structures/LexiconOutputStream.java	26 Feb 2009 16:11:47 -0000
@@ -25,40 +25,14 @@
  */
 package uk.ac.gla.terrier.structures;
 import java.io.DataOutput;
-import java.io.DataOutputStream;
-import java.io.File;
 import java.io.IOException;
-
-import org.apache.log4j.Logger;
-
-import uk.ac.gla.terrier.utility.ApplicationSetup;
-import uk.ac.gla.terrier.utility.Files;
 /**
  * This class implements an output stream for the lexicon structure.
- * @author Vassilis Plachouras
+ * @author Vassilis Plachouras &amp; Craig Macdonald
  * @version $Revision: 1.29 $
  */
-public class LexiconOutputStream implements Closeable {
-	/** The logger used */
-	private static Logger logger = Logger.getRootLogger();
-	/** A zero buffer for writing to the file.*/
-	protected final byte[] zeroBuffer =
-		new byte[ApplicationSetup.STRING_BYTE_LENGTH];
-	/** The term represented as an array of bytes.*/
-	protected final byte[] termCharacters =
-		new byte[ApplicationSetup.STRING_BYTE_LENGTH];
-	/** The term represented as a string.*/
-	protected String term;
-	/** An integer representing the id of the term.*/
-	protected int termId;
-	/** The document frequency of the term.*/
-	protected int documentFrequency;
-	/** The term frequency of the term.*/
-	protected int termFrequency;
-	/** The offset in bytes in the inverted file of the term.*/
-	protected long endOffset;
-	/** The offset in bits in the starting byte in the inverted file.*/
-	protected byte endBitOffset;
+public abstract class LexiconOutputStream<KEY> implements Closeable {
+	
 	/** A data input stream to read from the bufferInput.*/
 	protected DataOutput lexiconStream = null;
 	/** Pointer written - the sum of the Nts */
@@ -67,57 +41,7 @@
 	protected long numTokensWritten = 0;
 	protected int numTermsWritten = 0;
 
-	 /** A constructor for child classes that doesnt open the file */
-	protected LexiconOutputStream(long a, long b, long c) { }
-
-	/**
-	 * A default constructor.
-	 */
-	public LexiconOutputStream() {
-		try {
-			lexiconStream = new DataOutputStream(Files.writeFileStream(ApplicationSetup.LEXICON_FILENAME));
-		} catch (IOException ioe) {
-			logger.fatal(
-				"I/O error occured while opening the lexicon file. Stack trace follows.",ioe);
-		}
-	}
-	/** Create a lexicon using the specified data stream */
-	public LexiconOutputStream(DataOutput out){
-		lexiconStream = out;
-	}
-	
-	/**
-	 * A constructor given the filename.
-	 * @param filename java.lang.String the name of the lexicon file.
-	 */
-	public LexiconOutputStream(String filename) {
-		try {
-			lexiconStream = new DataOutputStream(Files.writeFileStream(filename));
-		} catch (IOException ioe) {
-			logger.fatal(
-				"I/O error occured while opening the lexicon file. Stack trace follows.",ioe);
-		}
-	}
-	/**
-	 * A constructor given the filename.
-	 * @param file java.io.File the name of the lexicon file.
-	 */
-	public LexiconOutputStream(File file) {
-		try {
-			lexiconStream = new DataOutputStream(Files.writeFileStream(file));
-		} catch (IOException ioe) {
-			logger.fatal(
-				"I/O error occured while opening the lexicon file. Stack trace follows.",ioe);
-		}
-	}
-
-	/** A constructor for a LexiconOutputStream given the index path and prefix
-	  * @param path String the path to the index
-	  * @param prefix String the prefix of the filenames in the index
-	  */
-	public LexiconOutputStream(String path, String prefix) {
-		this(path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX);
-	}
+	protected LexiconOutputStream() { }
 
 
 	/**
@@ -134,75 +58,16 @@
 	 * Writes a lexicon entry.
 	 * @return the number of bytes written to the file. 
 	 * @throws java.io.IOException if an I/O error occurs
-	 * @param _term the string representation of the term
-	 * @param _termId the terms integer identifier
-	 * @param _documentFrequency the term's document frequency in the collection
-	 * @param _termFrequency the term's frequency in the collection
-	 * @param _endOffset the term's ending byte offset in the inverted file
-	 * @param _endBitOffset the term's ending byte bit-offset in the inverted file
+	 * @param _key the key - usually the term
+	 * @param _value the lexicon entry value
 	 */
-	public int writeNextEntry(
-		String _term,
-		int _termId,
-		int _documentFrequency,
-		int _termFrequency,
-		long _endOffset,
-		byte _endBitOffset)
-		throws IOException {
-		byte[] tmpBytes = _term.getBytes();
-		final int length = tmpBytes.length;
-		numPointersWritten += _documentFrequency;
-		numTokensWritten += _termFrequency;
-		numTermsWritten++;
-		lexiconStream.write(tmpBytes, 0, length);
-		/* if an ArrayIndexOutOfBoundsException ocurrs here
-		 * this means that the term is longer than STRING_BYTE_LENGTH */
-		lexiconStream.write(
-			zeroBuffer,
-			0,
-			ApplicationSetup.STRING_BYTE_LENGTH - length);
-		lexiconStream.writeInt(_termId);
-		lexiconStream.writeInt(_documentFrequency);
-		lexiconStream.writeInt(_termFrequency);
-		lexiconStream.writeLong(_endOffset);
-		lexiconStream.writeByte(_endBitOffset);
-		return Lexicon.lexiconEntryLength;
-	}
-	/**
-	 * Writes a lexicon entry.
-	 * @return the number of bytes written.
-	 * @throws java.io.IOException if an I/O error occurs
-	 * @param _term the byte[] representation of the term. Using this format means that
-	 * the term does not have to be decoded and recoded every time.
-	 * @param _termId the terms integer identifier
-	 * @param _documentFrequency the term's document frequency in the collection
-	 * @param _termFrequency the term's frequency in the collection
-	 * @param _endOffset the term's ending byte offset in the inverted file
-	 * @param _endBitOffset the term's ending byte bit-offset in the inverted file
-	 */
-	public int writeNextEntry(
-		byte[] _term,
-		int _termId,
-		int _documentFrequency,
-		int _termFrequency,
-		long _endOffset,
-		byte _endBitOffset)
-		throws IOException {
-		final int length = _term.length;
-		numPointersWritten += _documentFrequency;
-		numTokensWritten += _termFrequency;
+	public abstract int writeNextEntry(KEY _key, LexiconEntry _value) throws IOException;
+	
+	protected void incrementCounters(TermStatistics t)
+	{
 		numTermsWritten++;
-		lexiconStream.write(_term, 0, _term.length);
-		lexiconStream.write(
-			zeroBuffer,
-			0,
-		   	ApplicationSetup.STRING_BYTE_LENGTH - length);
-		lexiconStream.writeInt(_termId);
-		lexiconStream.writeInt(_documentFrequency);
-		lexiconStream.writeInt(_termFrequency);
-		lexiconStream.writeLong(_endOffset);
-		lexiconStream.writeByte(_endBitOffset);
-		return Lexicon.lexiconEntryLength;
+		numPointersWritten += t.getDocumentFrequency();
+		numTokensWritten += t.getFrequency();
 	}
 
 	/** Returns the number of pointers there would be in an inverted index built using this lexicon (thus far).
@@ -224,56 +89,4 @@
 	{
 		return numTermsWritten;
 	}
-
-	/**
-	 * Sets the bit offset in the last byte of the term's entry in the inverted file.
-	 * @param _endBitOffset byte the bit offset in the last byte of the 
-	 *		term's entry in the inverted file.
-	 * @deprecated
-	 */
-	public void setEndBitOffset(byte _endBitOffset) {
-		endBitOffset = _endBitOffset;
-	}
-	/**
-	 * Sets the ending offset of the term's entry in the inverted file.
-	 * @param _endOffset long The ending byte of the term's 
-	 *		entry in the inverted file.
-	 * @deprecated
-	 */
-	public void setEndOffset(long _endOffset) {
-		endOffset = _endOffset;
-	}
-	/**
-	 * Sets the document frequency for the given term.
-	 * @param _Nt int The document frequency for the given term.
-	 * @deprecated
-	 */
-	public void setNt(int _Nt) {
-		documentFrequency = _Nt;
-	}
-	/**
-	 * Sets the string representation of the term.
-	 * @param _term java.lang.String The string representation of 
-	 *		the seeked term.
-	 * @deprecated
-	 */
-	public void setTerm(String _term) {
-		term = _term;
-	}
-	/**
-	 * Sets the term's id.
-	 * @param _termId int the term's identifier.
-	 * @deprecated
-	 */
-	public void setTermId(int _termId) {
-		termId = _termId;
-	}
-	/**
-	 * Sets the term frequency for the already found term.
-	 * @param _termFrequency int The term frequency in the collection.
- 	 * @deprecated
-	 */
-	public void setTF(int _termFrequency) {
-		termFrequency = _termFrequency;
-	}
 }
Index: src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java,v
retrieving revision 1.40
diff -w -u -r1.40 BlockInvertedIndexBuilder.java
--- src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java	28 Jan 2009 20:16:57 -0000	1.40
+++ src/uk/ac/gla/terrier/structures/indexing/BlockInvertedIndexBuilder.java	26 Feb 2009 16:11:47 -0000
@@ -34,15 +34,18 @@
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Map;
 
 import org.apache.log4j.Logger;
 
 import uk.ac.gla.terrier.structures.BlockDirectIndexInputStream;
-import uk.ac.gla.terrier.structures.BlockLexiconInputStream;
-import uk.ac.gla.terrier.structures.DocumentIndex;
+import uk.ac.gla.terrier.structures.BlockTermStatistics;
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexicon;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
 import uk.ac.gla.terrier.utility.Files;
@@ -99,42 +102,9 @@
 
 	private static Logger logger = Logger.getRootLogger();
 	protected String finalLexiconClass = "uk.ac.gla.terrier.structures.Lexicon";
-	protected String finalLexiconInputStreamClass = "uk.ac.gla.terrier.structures.LexiconInputStream";
 
-	/**
-	 * Creates an instance of the BlockInvertedIndex class.
-	 * @deprecated
-	 */
-	public BlockInvertedIndexBuilder() {
-		this(ApplicationSetup.TERRIER_INDEX_PATH,
-				ApplicationSetup.TERRIER_INDEX_PREFIX);
-	}
-
-	/**
-	 * Creates an instance of the BlockInvertedIndex class using the given
-	 * filename.
-	 * 
-	 * @param filename
-	 *			the name of the inverted file
-	 * @deprecated use this() or this(String, String) instead
-	 */
-	public BlockInvertedIndexBuilder(String filename) {
-		super(filename);
-		lexiconInputStream = BlockLexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
-	/**
-	 * @deprecated
-	 */
-	public BlockInvertedIndexBuilder(String path, String prefix) {
-		super(path, prefix);
-		lexiconInputStream = BlockLexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
-
-	public BlockInvertedIndexBuilder(Index index) {
-		super(index);
-		lexiconInputStream = BlockLexiconInputStream.class;
+	public BlockInvertedIndexBuilder(Index index, String structureName) {
+		super(index, structureName);
 		lexiconOutputStream = LexiconOutputStream.class;
 	}
 
@@ -145,26 +115,23 @@
 	 * need to read the direct file is related to the parameter M, and
 	 * consequently to the size of the available memory.
 	 */
+	@SuppressWarnings("unchecked")
 	public void createInvertedIndex() {
 		numberOfPointersPerIteration = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processpointers", "2000000")); 
 		processTerms = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processterms", "25000"));
 		try {
 			Runtime r = Runtime.getRuntime();
 			logger.info("creating block inverted index");
-			final String LexiconFilename = indexPathPrefix
-					+ ApplicationSetup.LEXICONSUFFIX;
-			final String DocumentIndexFilename = indexPathPrefix
-					+ ApplicationSetup.DOC_INDEX_SUFFIX;
-			DocumentIndex docIndex = new DocumentIndex(DocumentIndexFilename);
-			final int numberOfDocuments = docIndex.getNumberOfDocuments();
-			docIndex.close();
+			final String LexiconFilename = index.getPath() + "/" + index.getPrefix() + ".lexicon";
+			final int numberOfDocuments = index.getCollectionStatistics().getNumberOfDocuments();
 
 			long assumedNumberOfPointers = Long.parseLong(index.getIndexProperty("num.Pointers", "0"));
 			long numberOfTokens = 0;
 			long numberOfPointers = 0;
 
-			BlockLexiconInputStream lexiconStream = (BlockLexiconInputStream) getLexInputStream(LexiconFilename);
-			numberOfUniqueTerms = lexiconStream.numberOfEntries();
+			numberOfUniqueTerms = index.getLexicon().numberOfEntries(); // assign the builder's field (as before this change) rather than shadowing it with a local
+			Iterator<Map.Entry<String, LexiconEntry>> lexiconStream = (Iterator<Map.Entry<String, LexiconEntry>>)this.index.getIndexStructureInputStream("lexicon");
+
 			// A temporary file for storing the updated
 			// lexicon file, after creating the inverted file
 			DataOutputStream dos = new DataOutputStream(Files.writeFileStream(LexiconFilename.concat(".tmp2")));
@@ -299,49 +266,50 @@
 			this.numberOfTokens = numberOfTokens;
 			this.numberOfPointers = numberOfPointers;
 			file.close();
-			lexiconStream.close();
+			
+			if (lexiconStream instanceof Closeable) {
+				((Closeable)lexiconStream).close();
+			}
 			dos.close();
 			// finalising the lexicon file with the updated information
 			// on the frequencies and the offsets
-			BlockLexiconInputStream lis = (BlockLexiconInputStream) getLexInputStream(LexiconFilename);
+			// second pass over the lexicon: each entry is given the inverted file
+			// position recorded in the .tmp2 file and written to "tmplexicon"
 			// reading the original lexicon
-			LexiconOutputStream los = getLexOutputStream(LexiconFilename
-					.concat(".tmp3"));
+			lexiconStream = (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
+			
+			
 			// the updated lexicon
+			LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
+			
+			//the temporary data containing the offsets
 			DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
 
-			// the temporary data
-			while (lis.readNextEntryBytes() != -1) {
-				los.writeNextEntry(lis.getTermCharacters(), lis.getTermId(),
-						lis.getNt(),
-						// lis.getBlockFrequency(),
-						dis.readInt(),
-						// the term frequency
-						dis.readLong(), // the ending byte offset
-						dis.readByte());
+			while(lexiconStream.hasNext())
+			{
+				Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+				LexiconEntry value = lee.getValue();
+				value.setPosition(dis.readLong(), dis.readByte());
+				los.writeNextEntry(lee.getKey(), value);
 			}
-			lis.close();
 			los.close();
 			dis.close();
-			if (! Files.delete(LexiconFilename)) 
-				logger.error("delete file .lex failed!");
-			if (! Files.delete(LexiconFilename.concat(".tmp2"))) 
-				logger.error("delete file .lex.tmp2 failed!");
-			if (! Files.rename(LexiconFilename.concat(".tmp3"), LexiconFilename))
-				logger.error("rename file .lex.tmp3 to .lex failed!");
+			if (lexiconStream instanceof Closeable) { ((Closeable)lexiconStream).close(); } // close the second lexicon pass before the lexicon files are deleted/renamed
+			if (! Files.delete(LexiconFilename.concat(".tmp2"))) { logger.error("delete file .lex.tmp2 failed!"); }
+			MapFileLexicon.deleteMapFileLexicon("lexicon", index.getPath(), index.getPrefix());
+			MapFileLexicon.renameMapFileLexicon("tmplexicon", index.getPath(), index.getPrefix(), "lexicon", index.getPath(), index.getPrefix());
+			//TODO : BlockInvertedIndexBuilder should change the Lexicon to use BasicLexiconEntry instead of BlockLexiconEntry
 
-			index.addIndexStructure("lexicon",finalLexiconClass);
-			index.addIndexStructureInputStream("lexicon",finalLexiconInputStreamClass);
 			index.addIndexStructure(
-					"inverted", 
+					structureName, 
 					"uk.ac.gla.terrier.structures.BlockInvertedIndex", 
-					"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String", 
-					"lexicon,path,prefix");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String", 
+					"index,structureName");
 			 index.addIndexStructureInputStream(
-					"inverted",
+					structureName,
 					"uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream",
-					"java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-					"path,prefix,lexicon-inputstream");
+                    "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+                    "index,structureName,lexicon-inputstream");
 			index.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT );
 			//these should be already set, but in case their not
 			index.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
@@ -354,113 +322,19 @@
 		}
 	}
 
-	/**
-	 * Iterates through the lexicon, until it has reached the given number of
-	 * pointers
-	 * 
-	 * @param PointersToProcess
-	 *			Number of pointers to stop reading the lexicon after
-	 * @param blexiconStream
-	 *			the lexicon input stream to read
-	 * @param codesHashMap
-	 * @param tmpStorageStorage
-	 * @return
-	 */
-	protected IntLongTuple scanLexiconForPointers(final long PointersToProcess,
-			final LexiconInputStream blexiconStream,
-			final TIntIntHashMap codesHashMap, final ArrayList<TIntArrayList[]> tmpStorageStorage)
-			throws IOException {
-		final BlockLexiconInputStream lexiconStream = (BlockLexiconInputStream) blexiconStream;
-		int processTerms = 0;
-		long numberOfPointersThisIteration = 0;
-		long numberOfBlocksThisIteration = 0;
-		int j = 0; // counter of loop iterations
-		while (numberOfPointersThisIteration < PointersToProcess) {
-
-			if (lexiconStream.readNextEntry() == -1)
-				break;
-
-			processTerms++;
-
+	protected TIntArrayList[] createPointerForTerm(LexiconEntry le)
+	{
 			TIntArrayList[] tmpArray = new TIntArrayList[5];
-			final int tmpNT = lexiconStream.getNt();
+		final int tmpNT = le.getDocumentFrequency();
 			tmpArray[0] = new TIntArrayList(tmpNT);
 			tmpArray[1] = new TIntArrayList(tmpNT);
 			tmpArray[2] = new TIntArrayList(tmpNT);
 			tmpArray[3] = new TIntArrayList(tmpNT);
-			tmpArray[4] = new TIntArrayList(lexiconStream.getBlockFrequency());
-			numberOfPointersThisIteration += tmpNT;
-			numberOfBlocksThisIteration += lexiconStream.getBlockFrequency();
-
-			tmpStorageStorage.add(tmpArray);
-
-			// the class TIntIntHashMap return zero when you look up for a
-			// the value of a key that does not exist in the hash map.
-			// For this reason, the values that will be inserted in the
-			// hash map are increased by one.
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
-
-			// increment counter
-			j++;
-		}
-		if(logger.isDebugEnabled()){
-			logger.debug(numberOfPointersThisIteration + " pointers == "
-				+ processTerms + " terms == " + numberOfBlocksThisIteration
-				+ " blocks");
-		}
-		return new IntLongTuple(processTerms, numberOfPointersThisIteration);
+		tmpArray[4] = new TIntArrayList(((BlockTermStatistics)le).getBlockCount());
+		return tmpArray;
 	}
 
-	/**
-	 * Iterates through the lexicon, until it has reached the given number of
-	 * terms
-	 * 
-	 * @param processTerms
-	 *			Number of terms to stop reading the lexicon after
-	 * @param blexiconStream
-	 *			the lexicon input stream to read
-	 * @param codesHashMap
-	 * @param tmpStorageStorage
-	 * @return
-	 */
-	protected IntLongTuple scanLexiconForTerms(final int processTerms,
-			final LexiconInputStream blexiconStream,
-			final TIntIntHashMap codesHashMap, TIntArrayList[][] tmpStorage)
-			throws IOException {
-		final BlockLexiconInputStream lexiconStream = (BlockLexiconInputStream) blexiconStream;
-		int j = 0;
-		long numberOfBlocksThisIteration = 0;
-		long numberOfPointersThisIteration = 0;
-		for (; j < processTerms; j++) {
-
-			if (lexiconStream.readNextEntry() == -1)
-				break;
-
-			TIntArrayList[] tmpArray = new TIntArrayList[5];
-			final int tmpNT = lexiconStream.getNt();
-			tmpArray[0] = new TIntArrayList(tmpNT);
-			tmpArray[1] = new TIntArrayList(tmpNT);
-			tmpArray[2] = new TIntArrayList(tmpNT);
-			tmpArray[3] = new TIntArrayList(tmpNT);
-			tmpArray[4] = new TIntArrayList(lexiconStream.getBlockFrequency());
-
-			numberOfPointersThisIteration += tmpNT;
-			numberOfBlocksThisIteration += lexiconStream.getBlockFrequency();
 
-			tmpStorage[j] = tmpArray;
-
-			// the class TIntIntHashMap return zero when you look up for a
-			// the value of a key that does not exist in the hash map.
-			// For this reason, the values that will be inserted in the
-			// hash map are increased by one.
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
-		}
-		if(logger.isDebugEnabled()){
-			logger.debug(numberOfPointersThisIteration + " pointers == " + j
-				+ " terms == " + numberOfBlocksThisIteration + " blocks");
-		}
-		return new IntLongTuple(j, numberOfPointersThisIteration);
-	}
 
 	/**
 	 * Traverses the direct fies recording all occurrences of terms noted in
@@ -481,10 +355,7 @@
 		// scan the direct file
 		//BlockDirectIndexInputStream directInputStream = new BlockDirectIndexInputStream(
 		//		indexPath, indexPrefix);
-		BlockDirectIndexInputStream directInputStream =
-			index != null
-				? (BlockDirectIndexInputStream)index.getIndexStructureInputStream("direct")
-				: new BlockDirectIndexInputStream(indexPath, indexPrefix);
+		BlockDirectIndexInputStream directInputStream = (BlockDirectIndexInputStream)index.getIndexStructureInputStream("direct");
 		int[][] documentTerms = null;
 		int p = 0; // a document counter;
 		while ((documentTerms = directInputStream.getNextTerms()) != null) {
@@ -581,6 +452,9 @@
 			tmpMatrix = null;
 			tmpStorage[j] = null;
 
+			dos.writeLong(file.getByteOffset());
+			dos.writeByte(file.getBitOffset());
+
 			// write the first entry
 			int docid = tmpMatrix0[0];
 			file.writeGamma(docid + 1);
@@ -615,17 +489,17 @@
 					blockindex++;
 				}
 			}
-			long endOffset = file.getByteOffset();
-			byte endBitOffset = file.getBitOffset();
-			endBitOffset--;
-			if (endBitOffset < 0 && endOffset > 0) {
-				endBitOffset = 7;
-				endOffset--;
-			}
+			//long endOffset = file.getByteOffset();
+			//byte endBitOffset = file.getBitOffset();
+			//endBitOffset--;
+			//if (endBitOffset < 0 && endOffset > 0) {
+			//	endBitOffset = 7;
+			//	endOffset--;
+			//}
 			numTokens += frequency;
-			dos.writeInt(frequency);
-			dos.writeLong(endOffset);
-			dos.writeByte(endBitOffset);
+			//dos.writeInt(frequency);
+			//dos.writeLong(endOffset);
+			//dos.writeByte(endBitOffset);
 
 			// dereference the arrays so they can be destroyed by GC
 			tmpMatrix0 = tmpMatrix1 = tmpMatrix2 = tmpMatrix3 = tmpMatrix4 = null;
Index: src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java,v
retrieving revision 1.32
diff -w -u -r1.32 BlockLexiconBuilder.java
--- src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java	28 Jan 2009 20:16:57 -0000	1.32
+++ src/uk/ac/gla/terrier/structures/indexing/BlockLexiconBuilder.java	26 Feb 2009 16:11:47 -0000
@@ -26,299 +26,18 @@
  *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> 
  */
 package uk.ac.gla.terrier.structures.indexing;
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.PriorityQueue;
-
-import uk.ac.gla.terrier.structures.BlockLexiconInputStream;
-import uk.ac.gla.terrier.structures.BlockLexiconOutputStream;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.Lexicon;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
-import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.utility.ApplicationSetup;
 /**
  * Builds a block lexicon using block frequencies.
- * @author Douglas Johnson, Vassilis Plachouras &amp; Craig Macdonald
+ * @author Craig Macdonald
  * @version $Revision: 1.32 $
  */
 public class BlockLexiconBuilder extends LexiconBuilder
 {
-	
-	
-	/**
-	 * A default constructor of the class. The block lexicon is built in the 
-	 * default path and file: ApplicationSetup.TERRIER_INDEX_PATH and 
-	 * ApplicationSetup.TERRIER_INDEX_PREFIX respectively.
-	 */
-	public BlockLexiconBuilder()
-	{
-		this(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
-	}
-
-	/**
-	 * Creates an instance of the class, given the path
-	 * to save the final and temporary lexicons.
-	 * @param pathname String the path to save the temporary lexicons.
-	 */
-	public BlockLexiconBuilder(String pathname, String prefix) {
-		super(pathname, prefix);
-		LexiconMapClass = BlockLexiconMap.class;
-		lexiconOutputStream = BlockLexiconOutputStream.class;
-		lexiconInputStream = BlockLexiconInputStream.class;
-		try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
-	}
-	
-	public BlockLexiconBuilder(Index i)
+	public BlockLexiconBuilder(Index i, String _structureName)
 	{
-		super(i);
-		LexiconMapClass = BlockLexiconMap.class;
-		lexiconOutputStream = BlockLexiconOutputStream.class;
-		lexiconInputStream = BlockLexiconInputStream.class;
-		try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
-	}
-
-	/**
-	 * The method that performs processing of the lexicon after the
-	 * creation of the direct index has been completed. It flushes to 
-	 * disk the current temporary lexicon, and it starts the merging
-	 * of the temporary lexicons and the creation of the lexicon index. 
-	 */
-	public void finishedDirectIndexBuild()
-	{
-		logger.info("flushing block lexicon to disk after the direct index completed");
-		 //only write a temporary lexicon if there are any items in it
-		if (TempLex.getNumberOfNodes() > 0)
-			writeTemporaryLexicon();
-		TempLex = null;
-
-		//merges the temporary lexicons
-		if (tempLexFiles.size() > 0)
-			try{
-				merge(tempLexFiles);
-	
-				//creates the offsets file
-				final String lexiconFilename = 
-							indexPath + ApplicationSetup.FILE_SEPARATOR + 
-							indexPrefix + ApplicationSetup.LEXICONSUFFIX;
-				LexiconInputStream lis = getLexInputStream(lexiconFilename);
-				createLexiconIndex(
-						lis,
-						lis.numberOfEntries(),
-						/* after inverted index is built, the lexicon will be transformed into a
-						 * normal lexicon, without block frequency */
-						Lexicon.lexiconEntryLength 
-						);
-				TermCount = lis.numberOfEntries();
-				if (index != null)
-				{
-					index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.BlockLexicon");
-					index.addIndexStructureInputStream("lexicon", "uk.ac.gla.terrier.structures.BlockLexiconInputStream");
-					index.setIndexProperty("num.Terms", ""+lis.numberOfEntries());
-					index.setIndexProperty("num.Pointers", ""+lis.getNumberOfPointersRead());
-				}
-			} catch(IOException ioe){
-				logger.error("Indexing failed to merge temporary lexicons to disk. ",ioe);
-			}
-		else
-			logger.warn("No temporary lexicons to merge, skipping");
-	}
-
-	/** Merge the two LexiconInputStreams into the given LexiconOutputStream
-	  * @param lis1 First lexicon to be merged
-	  * @param lis2 Second lexicon to be merged
-	  * @param los Lexion to be merged to
-	  */
-	protected void mergeTwoLexicons(
-			LexiconInputStream blis1,
-			LexiconInputStream blis2,
-			LexiconOutputStream blos) throws IOException
-	{
-		final BlockLexiconInputStream lis1 = (BlockLexiconInputStream)blis1;
-		final BlockLexiconInputStream lis2 = (BlockLexiconInputStream)blis2;
-		final BlockLexiconOutputStream los = (BlockLexiconOutputStream)blos;
-
-		boolean hasMore1 = true;
-		boolean hasMore2 = true;
-		int termID1 = 0;
-		int termID2 = 0;
-		hasMore1 = (lis1.readNextEntry()!=-1);
-		hasMore2 = (lis2.readNextEntry()!=-1);
-		String sTerm1 = null;
-		String sTerm2 = null;
-		if (hasMore1) {
-			termID1 = lis1.getTermId();
-			sTerm1 = lis1.getTerm();
-		}
-		if (hasMore2) {
-			termID2 = lis2.getTermId();
-			sTerm2 = lis2.getTerm();
-		}
-		while (hasMore1 && hasMore2) {
-			int compareString = 0;
-			if (termID1 != termID2)
-			{
-				compareString = sTerm1.compareTo(sTerm2);
-				if (compareString == 0)//, but termids don't match
-				{
-					logger.error("Term "+sTerm1+" had two termids ("+ termID1+","+termID2+")");
-				}
-			}
-			
-			if (compareString <0) {
-				los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getBlockFrequency(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset());
-				hasMore1 = (lis1.readNextEntry()!=-1);
-				if (hasMore1) {
-					termID1 = lis1.getTermId();
-					sTerm1 = lis1.getTerm();
-				}
-			} else if (compareString >0) {
-				los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getBlockFrequency(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset());
-				hasMore2 = (lis2.readNextEntry()!=-1);
-				if (hasMore2) {
-					termID2 = lis2.getTermId();
-					sTerm2 = lis2.getTerm();
-				}
-			} else /*if (compareString == 0)*/ {
-				los.writeNextEntry(
-					sTerm1, 
-					termID1, 
-					lis1.getNt() + lis2.getNt(),
-					lis1.getBlockFrequency() + lis2.getBlockFrequency(),
-					lis1.getTF() + lis2.getTF(),  							 
-					0, //inverted index not built yet
-					(byte)0 //inverted index not built yet
-				);
-		
-				hasMore1 = (lis1.readNextEntry()!=-1);
-				hasMore2 = (lis2.readNextEntry()!=-1);
-				if (hasMore1) {
-					termID1 = lis1.getTermId();
-					sTerm1 = lis1.getTerm();
-				}
-				if (hasMore2) {
-					termID2 = lis2.getTermId();
-					sTerm2 = lis2.getTerm();
-				}
-			}
-		}
-		if (hasMore1) {
-			lis2.close();
-
-			while (hasMore1) {
-				los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getBlockFrequency(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset());
-				hasMore1 = (lis1.readNextEntry()!=-1);
-				if (hasMore1) {
-					termID1 = lis1.getTermId();
-					sTerm1 = lis1.getTerm();
-				}
-			}
-
-			//close input file 1 stream
-			lis1.close();
-			
-		} else if (hasMore2) {
-			lis1.close();
-
-			while (hasMore2) {
-				los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getBlockFrequency(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset());
-				hasMore2 = (lis2.readNextEntry()!=-1);
-				if (hasMore2) {
-					termID2 = lis2.getTermId();
-					sTerm2 = lis2.getTerm();
-				}
-			}
-			//close input file 2 stream
-			lis2.close();
-		}
-		//closing ouptut lexicon stream
-		los.close();	
-	}
-	
-	protected void mergeNLexicons(final LexiconInputStream[] _lis, final LexiconOutputStream _los) throws IOException
-	{
-		final int numLexicons = _lis.length;
-		long totalTokens = 0;
-		long totalPointers = 0;
-		final int hasMore[] = new int[numLexicons];
-		Arrays.fill(hasMore, -1);
-		final PriorityQueue<String> terms = new PriorityQueue<String>(numLexicons);
-		final BlockLexiconOutputStream los = (BlockLexiconOutputStream)_los;
-		final BlockLexiconInputStream[] lis = new BlockLexiconInputStream[numLexicons];
-		
-		for(int i=0;i<numLexicons;i++)
-		{
-			lis[i] = (BlockLexiconInputStream) _lis[i];
-			hasMore[i] = lis[i].readNextEntry();
-			terms.add(lis[i].getTerm());	
-		}
-		int Tf = 0; int Nt = 0; int Bf = 0; String targetTerm= null;
-		int targetTermId  = -1;
-		while(terms.size() > 0)
-		{
-			//what term are we working on
-			targetTerm = terms.poll();
-			//logger.debug("Current term is "+targetTerm + "length="+targetTerm.length());
-			//for each input lexicon
-			for(int i=0;i<numLexicons;i++)
-			{
-				//does this lexicon contain the term
-				//logger.debug("Checking lexicon "+i+" for "+targetTerm+"="+lis[i].getTerm());
-				if(hasMore[i] != -1 && lis[i].getTerm().equals(targetTerm))
-				{
-					if (targetTermId == -1)
-					{	//obtain the termid for this term from the first lexicon that has the term
-						targetTermId = lis[i].getTermId();
-					}
-					else if (targetTermId != lis[i].getTermId())
-					{	//check the termids match for this term
-						logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+lis[i].getTermId()+")");
-					}
-					//logger.debug("Term "+targetTerm + " found in "+i + "termid="+ lis[i].getTermId());
-					Tf += lis[i].getTF();
-					Nt += lis[i].getNt();
-					Bf += lis[i].getBlockFrequency();
-					hasMore[i] = lis[i].readNextEntry();
-					if (hasMore[i] != -1)
-					{
-						terms.add(lis[i].getTerm());
-						//break;
-					}
-					break;
-				}
-			}
-			if (terms.size()>0 && !terms.peek().equals(targetTerm))
-			{
-				if (targetTermId == -1)
-				{
-					logger.error("Term "+ targetTerm + " not found in any lexicons");
-				}
-				//end of this term, so we can write the lexicon entry
-				totalTokens += Tf;
-				totalPointers += Nt;
-				los.writeNextEntry(targetTerm, targetTermId, Nt, Tf, Bf, 0, (byte)0);
-				Bf = 0; Tf = Nt = 0; targetTermId = -1; targetTerm = null;
-			}
-		}
-		totalTokens += Tf;
-		totalPointers += Nt;
-		if (targetTermId != -1)
-			los.writeNextEntry(targetTerm, targetTermId, Nt, Tf, Bf, 0, (byte)0);
-		los.close();
-		for(int i=0;i<numLexicons;i++)
-			lis[i].close();
-	}
-
-
-	public static void main(String args[]) {
-		String path = args[0];
-		String prefix = args[1];
-		BlockLexiconBuilder blb = new BlockLexiconBuilder(path, prefix);
-		
-		String lexiconFilename = path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICONSUFFIX;
-		LexiconInputStream lexStream = new LexiconInputStream(lexiconFilename);
-		blb.createLexiconHash(lexStream);
-		
+		super(i, _structureName, 
+			BlockLexiconMap.class, 
+			"uk.ac.gla.terrier.structures.BlockLexiconEntry");
 	}
-	
 }
Index: src/uk/ac/gla/terrier/structures/indexing/BlockLexiconMap.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/BlockLexiconMap.java,v
retrieving revision 1.5
diff -w -u -r1.5 BlockLexiconMap.java
--- src/uk/ac/gla/terrier/structures/indexing/BlockLexiconMap.java	28 Jan 2009 20:16:57 -0000	1.5
+++ src/uk/ac/gla/terrier/structures/indexing/BlockLexiconMap.java	26 Feb 2009 16:11:47 -0000
@@ -27,17 +27,23 @@
  */
 package uk.ac.gla.terrier.structures.indexing;
 
-import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.BlockLexiconOutputStream;
-import gnu.trove.*;
-import java.util.Arrays;
+import gnu.trove.TObjectIntHashMap;
+import gnu.trove.TObjectIntProcedure;
+
 import java.io.IOException;
+import java.util.Arrays;
+
+import uk.ac.gla.terrier.structures.BlockLexiconEntry;
+import uk.ac.gla.terrier.structures.LexiconOutputStream;
 import uk.ac.gla.terrier.utility.TermCodes;
 
 /** LexiconMap implementation that also keeps track of the number of blocks that a term occurrs in.
   * This is useful for sizing the block inverted index */
 public class BlockLexiconMap extends LexiconMap
 {
+	protected static final byte zerob = (byte)0;
+	protected static final long zerol = (long)0;
+	
 	/** Total number of blocks in this index */
 	protected long numberOfBlocks = 0;
 	/** Mapping term to blocks */
@@ -86,15 +92,13 @@
 
 	/** Stores the lexicon map to a lexicon stream as a sequence of entries.
 	  * @param _lexiconStream The lexicon output stream to store to. */
-	public void storeToStream(final LexiconOutputStream _lexiconStream) throws IOException {
-		final BlockLexiconOutputStream lexiconStream = (BlockLexiconOutputStream)_lexiconStream;
-		final byte zerob = (byte)0;
-		final long zerol = (long)0;
+	public void storeToStream(final LexiconOutputStream<String> lexiconStream) throws IOException {
+		
 		final String[] terms = tfs.keys(new String[0]);
 		Arrays.sort(terms);
 		for (String t : terms)
 		{
-			lexiconStream.writeNextEntry(t, TermCodes.getCode(t), nts.get(t), tfs.get(t), blockFreqs.get(t), zerol, zerob);
+			lexiconStream.writeNextEntry(t, new BlockLexiconEntry(TermCodes.getCode(t), nts.get(t), tfs.get(t), zerol, zerob, blockFreqs.get(t)));
 		}
 	}
 
Index: src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java,v
retrieving revision 1.41
diff -w -u -r1.41 InvertedIndexBuilder.java
--- src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java	28 Jan 2009 20:16:58 -0000	1.41
+++ src/uk/ac/gla/terrier/structures/indexing/InvertedIndexBuilder.java	26 Feb 2009 16:11:47 -0000
@@ -33,15 +33,22 @@
 import java.io.DataOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.Map;
 
+import org.apache.hadoop.io.Text;
 import org.apache.log4j.Logger;
+
 import uk.ac.gla.terrier.compression.BitOut;
 import uk.ac.gla.terrier.compression.BitOutputStream;
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.DirectIndexInputStream;
-import uk.ac.gla.terrier.structures.DocumentIndex;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexicon;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.FieldScore;
 import uk.ac.gla.terrier.utility.Files;
@@ -84,8 +91,6 @@
  */
 public class InvertedIndexBuilder {
 
-	/** class to be used as a lexiconinputstream. set by this and child classes */
-	protected Class lexiconInputStream = null;
 	/** class to be used as a lexiconoutpustream. set by this and child classes */
 	protected Class lexiconOutputStream = null;
 
@@ -104,13 +109,6 @@
 		}
 	}
 
-	/** the directory in which index files should be created */
-	protected String indexPath;
-	/** the first part of the filename component of index files */
-	protected String indexPrefix;
-	
-	protected String indexPathPrefix;
-	
 	/** The number of unique terms in the vocabulary.*/
 	public int numberOfUniqueTerms;
 	
@@ -128,6 +126,8 @@
 	
 	protected Index index = null;
 	
+	protected String structureName = null;
+	
 	/** The number of pointers to be processed in an interation. This directly corresponds to the
 	  * property <tt>invertedfile.processpointers</tt>. If this property is set and > 0, then each
 	  * iteration of the inverted index creation will be done to a set number of pointers, not a set
@@ -140,62 +140,20 @@
 	 */
 	protected BitOut file;
 
-	/**
-	 * Constructor of the class InvertedIndex.
-	 * @deprecated
-	 */
-	public InvertedIndexBuilder(String Path, String Prefix)
-	{
-		indexPath = Path; indexPrefix = Prefix;
-		indexPathPrefix = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix;
-		try{
-			file = new BitOutputStream(indexPathPrefix + ApplicationSetup.IFSUFFIX);
-		} catch (IOException ioe) {
-			logger.error("creating BitOutputStream for writing the inverted file : ", ioe);
-		}
-		lexiconInputStream = LexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
-	
-	public InvertedIndexBuilder(Index i)
+	public InvertedIndexBuilder(Index i, String _structureName)
 	{
 		this.index = i;
-		indexPath = index.getPath(); indexPrefix = index.getPrefix();
-		indexPathPrefix = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix;
+		this.structureName = _structureName;
+		
 		try{
-			file = new BitOutputStream(indexPathPrefix + ApplicationSetup.IFSUFFIX);
+			file = new BitOutputStream(index.getPath() + "/"+ index.getPrefix() + "." +structureName + ".bf");
 		} catch (IOException ioe) {
 			logger.error("creating BitOutputStream for writing the inverted file : ", ioe);
 		}
-		lexiconInputStream = LexiconInputStream.class;
 		lexiconOutputStream = LexiconOutputStream.class;
 	}
 
 
-	/**
-	 * A default constructor of the class InvertedIndex.
-	 * @deprecated
-	 */
-	public InvertedIndexBuilder() {
-		this(ApplicationSetup.TERRIER_INDEX_PATH,
-			ApplicationSetup.TERRIER_INDEX_PREFIX);
-	}
-
-	/**
-	 * Creates an instance of the InvertedIndex
-	 * class using the given filename.
-	 * @param filename The name of the inverted file
-	 * @deprecated Use this() or this(String, String)
-	 */
-	public InvertedIndexBuilder(String filename) {
-		try{
-			file = new BitOutputStream(filename);
-		} catch (IOException ioe) {
-			logger.error("Creating BitOutputStream for writing the direct file : ", ioe);
-		}
-		lexiconInputStream = LexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
 
 	/**
 	 * Closes the underlying bit file.
@@ -208,21 +166,22 @@
 	 * Creates the inverted index using the already created direct index,
 	 * document index and lexicon.
 	 */
+	@SuppressWarnings("unchecked")
 	public void createInvertedIndex() {
 		try {
 			Runtime r = Runtime.getRuntime();
 			logger.debug("creating inverted index");
-			final String LexiconFilename = indexPathPrefix + ApplicationSetup.LEXICONSUFFIX;
+			final String LexiconFilename = index.getPath() + "/" + index.getPrefix() + ".lexicon";
 			
 			final int numberOfDocuments = index.getCollectionStatistics().getNumberOfDocuments();
 		
 			long assumedNumberOfPointers = Long.parseLong(index.getIndexProperty("num.Pointers", "0"));				
 			long numberOfTokens = 0;
 			long numberOfPointers = 0;
+			int numberOfUniqueTerms = index.getLexicon().numberOfEntries();
+			Iterator<Map.Entry<String,LexiconEntry>> lexiconStream = 
+				(Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
 		
-			LexiconInputStream lexiconStream = getLexInputStream(LexiconFilename);
-			numberOfUniqueTerms = lexiconStream.numberOfEntries();
-			final int fieldsCount = FieldScore.FIELDS_COUNT;
 			//A temporary file for storing the updated lexicon file, after
 			// creating the inverted file
 			DataOutputStream dos = new DataOutputStream(Files.writeFileStream(LexiconFilename.concat(".tmp2")));
@@ -374,52 +333,56 @@
 			this.numberOfUniqueTerms = numberOfUniqueTerms;
 			this.numberOfPointers = numberOfPointers;
 
-			lexiconStream.close();
+			if (lexiconStream instanceof Closeable) {
+				((Closeable)lexiconStream).close();
+			}
 			dos.close();
 			//finalising the lexicon file with the updated information
 			//on the frequencies and the offsets
-			
 			//reading the original lexicon
-			LexiconInputStream lis = getLexInputStream(LexiconFilename);
+			lexiconStream = (Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
+			
 			
 			//the updated lexicon
-			LexiconOutputStream los = getLexOutputStream(LexiconFilename.concat(".tmp3"));
+			LexiconOutputStream<String> los = getLexOutputStream("tmplexicon");
 			
 			//the temporary data containing the offsets
 			DataInputStream dis = new DataInputStream(Files.openFileStream(LexiconFilename.concat(".tmp2")));
 			
-			while (lis.readNextEntryBytes() != -1) {
-				los.writeNextEntry(lis.getTermCharacters(), lis.getTermId(),
-						lis.getNt(), 
-						dis.readInt(), //the term frequency
-						dis.readLong(), //end byte offset
-						dis.readByte());//end bit offset
+			while(lexiconStream.hasNext())
+			{
+				Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+				LexiconEntry value = lee.getValue();
+				value.setPosition(dis.readLong(), dis.readByte());
+				los.writeNextEntry(lee.getKey(), value);
+			}
+			if (lexiconStream instanceof Closeable) {
+				((Closeable)lexiconStream).close();
 			}
-			lis.close();
 			los.close();
 			dis.close();
-            if (! Files.delete(LexiconFilename))
-                logger.error("delete file .lex failed!");
-            if (! Files.delete(LexiconFilename.concat(".tmp2")))
-                logger.error("delete file .lex.tmp2 failed!");
-            if (! Files.rename(LexiconFilename.concat(".tmp3"), LexiconFilename))
-                logger.error("rename file .lex.tmp3 to .lex failed!");
+			Files.delete(LexiconFilename.concat(".tmp2"));
+			MapFileLexicon.deleteMapFileLexicon("lexicon", index.getPath(), index.getPrefix());
+			MapFileLexicon.renameMapFileLexicon(
+					"tmplexicon", index.getPath(), index.getPrefix(), 
+					"lexicon", index.getPath(), index.getPrefix());
 			
 			index.addIndexStructure(
-					"inverted", 
+					structureName, 
 					"uk.ac.gla.terrier.structures.InvertedIndex", 
-					"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String", 
-					"lexicon,path,prefix");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String", 
+					"index,structureName");
 			index.addIndexStructureInputStream(
-                    "inverted",
+					structureName,
                     "uk.ac.gla.terrier.structures.InvertedIndexInputStream",
-                    "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-                    "path,prefix,lexicon-inputstream");
+                    "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+                    "index,structureName,lexicon-inputstream");
 			index.setIndexProperty("num.inverted.fields.bits", ""+FieldScore.FIELDS_COUNT );
 			//should be already set, but in case their not
 			index.setIndexProperty("num.Terms", ""+numberOfUniqueTerms);
 			index.setIndexProperty("num.Tokens", ""+numberOfTokens);
 			index.setIndexProperty("num.Pointers", ""+numberOfPointers);
+			index.flush();
 			System.gc();
 			
 		} catch (IOException ioe) {
@@ -427,6 +390,16 @@
 		}
 	}
 	
+	/** Creates the temporary per-term storage: an array of three TIntArrayLists,
+	  * each constructed with the document frequency of the given lexicon entry. */
+	protected TIntArrayList[] createPointerForTerm(LexiconEntry le)
+	{
+		final int docFrequency = le.getDocumentFrequency();
+		return new TIntArrayList[] {
+			new TIntArrayList(docFrequency), new TIntArrayList(docFrequency), new TIntArrayList(docFrequency)
+		};
+	}
+	
 	/** Iterates through the lexicon, until it has reached the given number of pointers
 	  * @param PointersToProcess Number of pointers to stop reading the lexicon after
 	  * @param lexiconStream the lexicon input stream to read 
@@ -436,7 +409,7 @@
 	  */
 	protected IntLongTuple scanLexiconForPointers(
 		final long PointersToProcess, 
-		final LexiconInputStream lexiconStream, 
+		final Iterator<Map.Entry<String,LexiconEntry>> lexiconStream, 
 		final TIntIntHashMap codesHashMap,
 		final ArrayList<TIntArrayList[]> tmpStorageStorage)
 		throws IOException
@@ -446,27 +419,21 @@
 		int j=0; //counter of loop iterations
 		while(numberOfPointersThisIteration < PointersToProcess) {
 		
-			if (lexiconStream.readNextEntry() == -1)
+			if (! lexiconStream.hasNext())
 				break;
 			
-			processTerms++;
-			
-			TIntArrayList[] tmpArray = new TIntArrayList[3];
-			final int tmpNT = lexiconStream.getNt();
-			tmpArray[0] = new TIntArrayList(tmpNT);
-			tmpArray[1] = new TIntArrayList(tmpNT);
-			tmpArray[2] = new TIntArrayList(tmpNT);
-			
-			numberOfPointersThisIteration += tmpNT;
-			
+			Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+			LexiconEntry le = lee.getValue();
 			
-			tmpStorageStorage.add(tmpArray);
+			processTerms++;			
+			numberOfPointersThisIteration += le.getDocumentFrequency();		
+			tmpStorageStorage.add(createPointerForTerm(le));
 			
 			//the class TIntIntHashMap return zero when you look up for a
 			//the value of a key that does not exist in the hash map.
 			//For this reason, the values that will be inserted in the 
 			//hash map are increased by one. 
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
+			codesHashMap.put(le.getTermId(), j + 1);
 			
 			//increment counter
 			j++;
@@ -488,7 +455,7 @@
 	  */
 	protected IntLongTuple scanLexiconForTerms(
 		final int processTerms, 
-		final LexiconInputStream lexiconStream, 
+		final Iterator<Map.Entry<String,LexiconEntry>> lexiconStream, 
 		final TIntIntHashMap codesHashMap,
 		TIntArrayList[][] tmpStorage)
 		throws IOException
@@ -498,11 +465,14 @@
 		long numberOfPointersThisIteration = 0;
 		for (; j < processTerms; j++) {
 		
-			if (lexiconStream.readNextEntry() == -1)
+			if (! lexiconStream.hasNext())
 				break;
 		
+			Map.Entry<String,LexiconEntry> lee = lexiconStream.next();
+			LexiconEntry le = lee.getValue();
+		
 			TIntArrayList[] tmpArray = new TIntArrayList[3];
-			final int tmpNT = lexiconStream.getNt();
+			final int tmpNT = le.getDocumentFrequency();
 			tmpArray[0] = new TIntArrayList(tmpNT);
 			tmpArray[1] = new TIntArrayList(tmpNT);
 			tmpArray[2] = new TIntArrayList(tmpNT);
@@ -516,7 +486,7 @@
 			//the value of a key that does not exist in the hash map.
 			//For this reason, the values that will be inserted in the 
 			//hash map are increased by one. 
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
+			codesHashMap.put(le.getTermId(), j + 1);
 		}
 		if (logger.isDebugEnabled())
 			logger.debug(
@@ -538,10 +508,7 @@
 		throws IOException 
 	{
 		//scan the direct file
-		DirectIndexInputStream directInputStream =
-			index != null
-				? (DirectIndexInputStream)index.getIndexStructureInputStream("direct")
-				: new DirectIndexInputStream(indexPath, indexPrefix);
+		DirectIndexInputStream directInputStream = (DirectIndexInputStream)index.getIndexStructureInputStream("direct");
 		int[][] documentTerms = null;
 		int p = 0; //a document counter;
 		final boolean useFieldInformation = this.useFieldInformation;
@@ -582,8 +549,7 @@
 	protected void traverseDirectFile(int[][][] tmpStorage, int[] indices, TIntIntHashMap codesHashMap) 
 		throws IOException
 	{
-		DirectIndexInputStream directInputStream = new DirectIndexInputStream(
-			indexPath, indexPrefix);
+		DirectIndexInputStream directInputStream = (DirectIndexInputStream)index.getIndexStructureInputStream("direct");
 		int[][] documentTerms = null;
 		int[] documentTerms0 = null;
 		int[] documentTerms1 = null;
@@ -652,14 +618,13 @@
 		throws IOException
 	{
 		//write to the inverted file. We should note that the lexicon 
-		//file should be updated as well with the term frequency and
-		//the endOffset and endBitOffset.
-		
-		//remove this, as it now happens at the end of this method
-		//the first call is made at the start of createInvertedIndex
-		//file.writeReset();
+		//should be updated with the start bit and byte offset for this
+		//set of postings.
 		int frequency; long numTokens = 0;
 		for (int j = 0; j < processTerms; j++) {
+			dos.writeLong(file.getByteOffset());
+			dos.writeByte(file.getBitOffset());
+			
 			frequency = 0; //the term frequency
 			TIntArrayList[] tmpMatrix = tmpStorage[j];
 			final int[] tmpMatrix0 = tmpMatrix[0].toNativeArray();
@@ -705,21 +670,17 @@
 				}
 			}
 			
-			long endOffset = file.getByteOffset();
-			byte endBitOffset = file.getBitOffset();
-			endBitOffset--;
-			if (endBitOffset < 0 && endOffset > 0) {
-				endBitOffset = 7;
-				endOffset--;
-			}
+			//long endOffset = file.getByteOffset();
+			//byte endBitOffset = file.getBitOffset();
+			//endBitOffset--;
+			//if (endBitOffset < 0 && endOffset > 0) {
+			//	endBitOffset = 7;
+			//	endOffset--;
+			//}
 			numTokens += frequency;
-			dos.writeInt(frequency);
-			dos.writeLong(endOffset);
-			dos.writeByte(endBitOffset);
-		}
-		//file.writeFlush();
-		//we have to force a reset here, as otherwise the buffer isn't cleared.
-		//file.writeReset();
+			//dos.writeInt(frequency);
+			
+		}
 		return numTokens;
 	}
 	
@@ -734,124 +695,6 @@
 	 */
 	protected int processTerms = Integer.parseInt(ApplicationSetup.getProperty("invertedfile.processterms", "75000"));
 	
-	/*
-	for (int i = 0; i < numberOfUniqueTerms; i = i + processTerms) {
-		//set the number of terms to process from the lexicon
-		if ((i + processTerms) > numberOfUniqueTerms)
-			processTerms = (int) numberOfUniqueTerms - i;
-		//start processing part of the lexicon
-		startProcessingLexicon = System.currentTimeMillis();
-		//preparing the data structures to store the data
-		int[] indices = new int[processTerms];
-		int[][][] tmpStorage = new int[processTerms][][];
-		TIntIntHashMap codesHashMap = new TIntIntHashMap(processTerms);
-		int numberOfPointersPerIteration = 0;
-	
-		int numOfFields = 2;
-		if (useFieldInformation)
-			numOfFields = 3;
-	
-		for (int j = 0; j < processTerms; j++) {
-			lexiconStream.readNextEntry();
-			//int[][] tmpArray = new int[numOfFields][lexiconStream.getNt()];
-			numberOfPointersPerIteration += lexiconStream.getNt();
-			//tmpStorage.add(tmpArray);
-			tmpStorage[j] = new int[numOfFields][lexiconStream.getNt()];
-			//the class TIntIntHashMap return zero when you look up for
-			// a the value of a key that does not exist in the hash map.
-			//For this reason, the values that will be inserted in the
-			//hash map are increased by one.
-			codesHashMap.put(lexiconStream.getTermId(), j + 1);
-		}
-		numberOfPointers += numberOfPointersPerIteration;
-		endProcessingLexicon = System.currentTimeMillis();
-		startTraversingDirectFile = System.currentTimeMillis();
-		//scan the direct file
-		//uses indices, tmpStorage and codesHashMap
-		traverseDirectFile(tmpStorage, indices, codesHashMap);
-		//end of traversing the
-		endTraversingDirectFile = System.currentTimeMillis();
-		startWritingInvertedFile = System.currentTimeMillis();
-		//write to the inverted file. We should note that the lexicon
-		//file should be updated as well with the term frequency and
-		//the endOffset and endBitOffset.
-		//file.writeReset();
-		int frequency;
-		int[][] tmpMatrix = null;
-		int[] tmpMatrix0 = null;
-		int[] tmpMatrix1 = null;
-	
-		for (int j = 0; j < processTerms; j++) {
-			frequency = 0; //the term frequency
-			//tmpMatrix = (int[][]) tmpStorage.elementAt(j);
-			tmpMatrix = tmpStorage[j];
-			tmpMatrix0 = tmpMatrix[0];
-			tmpMatrix1 = tmpMatrix[1];
-	
-			//we do not need to sort because the documents are read in
-			//order of docid, and therefore the arrays are already
-			// sorted.
-			if (useFieldInformation) {
-				int[] tmpMatrix2 = tmpMatrix[2];
-				//write the first entry
-				file.writeGamma(tmpMatrix0[0] + 1);
-				frequency += tmpMatrix1[0];
-				file.writeUnary(tmpMatrix1[0]);
-				file.writeBinary(fieldsCount, tmpMatrix2[0]);
-				final int tmpMatrix0Length = tmpMatrix0.length;
-				for (int k = 1; k < tmpMatrix0Length; k++) {
-					file.writeGamma(tmpMatrix0[k] - tmpMatrix0[k - 1]);
-					frequency += tmpMatrix1[k];
-					file.writeUnary(tmpMatrix1[k]);
-					file.writeBinary(fieldsCount, tmpMatrix2[k]);
-				}
-			} else {
-				//write the first entry
-				file.writeGamma(tmpMatrix0[0] + 1);
-				frequency += tmpMatrix1[0];
-				file.writeUnary(tmpMatrix1[0]);
-				final int tmpMatrix0Length = tmpMatrix0.length;
-				for (int k = 1; k < tmpMatrix0Length; k++) {
-					file.writeGamma(tmpMatrix0[k] - tmpMatrix0[k - 1]);
-					frequency += tmpMatrix1[k];
-					file.writeUnary(tmpMatrix1[k]);
-				}
-			}
-	
-			long endOffset = file.getByteOffset();
-			byte endBitOffset = file.getBitOffset();
-			endBitOffset--;
-			if (endBitOffset < 0 && endOffset > 0) {
-				endBitOffset = 7;
-				endOffset--;
-			}
-			numberOfTokens += frequency;
-			dos.writeInt(frequency);
-			dos.writeLong(endOffset);
-			dos.writeByte(endBitOffset);
-		}
-		//file.writeFlush();
-		endWritingInvertedFile = System.currentTimeMillis();
-	
-		System.err.println("time to process part of lexicon: "
-			+ ((endProcessingLexicon - startProcessingLexicon) / 1000D));
-		System.err.println("time to traverse direct file: "
-			+ ((endTraversingDirectFile - startTraversingDirectFile) / 1000D));
-		System.err.println("time to write inverted file: "
-			+ ((endWritingInvertedFile - startWritingInvertedFile) / 1000D));
-		System.err.println("time to perform one iteration: "
-			+ ((endWritingInvertedFile - startProcessingLexicon) / 1000D));
-		System.err.println("number of pointers processed: "
-			+ numberOfPointersPerIteration);
-		
-		indices = null;
-		tmpStorage  = null; 
-		codesHashMap.clear(); 
-		codesHashMap = null;
-	
-	}
-	*/
-	
 	public static void displayMemoryUsage(Runtime r)
 	{
 		if (logger.isDebugEnabled())
@@ -862,26 +705,14 @@
 		);
 	}
 
-	public LexiconInputStream getLexInputStream(String filename)
-	{
-		LexiconInputStream li = null;
-		try{
-			li = (LexiconInputStream) lexiconInputStream.getConstructor(String.class).newInstance(filename);
-		} catch (Exception e) {
-			logger.error("Problem loading a LexiconInputStream", e);
-		}
-		return li;
-	}
 
-	public LexiconOutputStream getLexOutputStream(String filename)
+	@SuppressWarnings("unchecked")
+	protected LexiconOutputStream<String> getLexOutputStream(String structureName) throws IOException
 	{
-		LexiconOutputStream lo = null;
-		try{
-			lo = (LexiconOutputStream) lexiconOutputStream.getConstructor(String.class).newInstance(filename);
-		} catch (Exception e) {
-			logger.error("Problem loading a LexiconOutputStream", e);
-		}
-		return lo;
+		return new MapFileLexiconOutputStream(
+				index.getPath(), index.getPrefix(), 
+				structureName, 
+				(FixedSizeWriteableFactory<Text>)index.getIndexStructure("lexicon-keyfactory"));
 	}
 
 }
Index: src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java,v
retrieving revision 1.47
diff -w -u -r1.47 LexiconBuilder.java
--- src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java	28 Jan 2009 20:16:58 -0000	1.47
+++ src/uk/ac/gla/terrier/structures/indexing/LexiconBuilder.java	26 Feb 2009 16:11:47 -0000
@@ -25,25 +25,23 @@
  *   Vassilis Plachouras <vassilis{a.}dcs.gla.ac.uk>
  */
 package uk.ac.gla.terrier.structures.indexing;
-import gnu.trove.TIntObjectHashMap;
-
-import java.io.DataOutputStream;
 import java.io.IOException;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
 import java.util.Arrays;
-import java.util.HashSet;
+import java.util.Iterator;
 import java.util.LinkedList;
+import java.util.Map;
 import java.util.PriorityQueue;
-import java.util.Set;
 
+import org.apache.hadoop.io.Text;
 import org.apache.log4j.Logger;
 
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.Lexicon;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconInputStream;
+import uk.ac.gla.terrier.structures.MapFileLexicon;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.Files;
 /**
@@ -54,12 +52,13 @@
  */
 public class LexiconBuilder
 {
-	/** class to be used as a lexiconinputstream. set by this and child classes */
-	protected Class lexiconInputStream = null;
+
 	/** class to be used as a lexiconoutpustream. set by this and child classes */
-	protected Class lexiconOutputStream = null;
+	protected Class<? extends LexiconOutputStream> lexiconOutputStream = null;
 
-	protected Class LexiconMapClass = null;
+	protected Class<? extends LexiconMap> LexiconMapClass = null;
+	
+	protected final String lexiconEntryFactoryValueClass;
 	
 	/** The logger used for this class */
 	protected static Logger logger = Logger.getRootLogger();
@@ -75,7 +74,7 @@
 	 * is created.
 	 */
 	protected static final int DocumentsPerLexicon = ApplicationSetup.BUNDLE_SIZE;
-	/** The linkedlist in which the temporary lexicon filenames are stored.
+	/** The linkedlist in which the temporary lexicon structure names are stored.
 	  * These are merged into a single Lexicon by the merge() method. 
 	  * LinkedList is best List implementation for this, as all operations
 	  * are either append element, or remove first element - making LinkedList
@@ -110,40 +109,72 @@
 	/** Number of lexicons to merge at once. Set by property <tt>lexicon.builder.merge.lex.max</tt>, defaults to 16 */
 	protected static final int MAXLEXMERGE = Integer.parseInt(ApplicationSetup.getProperty("lexicon.builder.merge.lex.max", "16"));
 
-	/**
-	 * A default constructor of the class. The lexicon is built in the 
-	 * default path and file: ApplicationSetup.TERRIER_INDEX_PATH and 
-	 * ApplicationSetup.TERRIER_INDEX_PREFIX respectively.
-	 * @deprecated
-	 */
-	public LexiconBuilder()
+	public interface CollectionStaticticsCounter<V> extends Closeable
+	{
+		public void count(V value);
+	}
+	
+	static class BasicLexiconCollectionStaticticsCounter 
+		implements CollectionStaticticsCounter<LexiconEntry>
+	{
+		long numberOfTokens = 0;
+		int numberOfTerms = 0;
+		long numberOfPointers = 0;
+		final Index index;
+		public BasicLexiconCollectionStaticticsCounter(Index _index)
+		{
+			index = _index;
+		}
+		
+		public void count(LexiconEntry value)
+		{
+			numberOfTokens += value.getFrequency();
+			numberOfPointers += value.getDocumentFrequency();
+			numberOfTerms++;
+		}
+		
+		public void close()
+		{
+			if (index != null)
 	{
-		this(ApplicationSetup.TERRIER_INDEX_PATH, ApplicationSetup.TERRIER_INDEX_PREFIX);
+				index.setIndexProperty("num.Terms", ""+numberOfTerms);
+				index.setIndexProperty("num.Tokens", ""+numberOfTokens);
+				index.setIndexProperty("num.Pointers", ""+numberOfPointers);
+			}
+		}
+	}
+	
+	protected String defaultStructureName;
+	protected FixedSizeWriteableFactory<LexiconEntry> valueFactory;
+	
+	
+	public LexiconBuilder(Index i, String _structureName) {
+		this(i, _structureName, 
+				LexiconMap.class, "uk.ac.gla.terrier.structures.BasicLexiconEntry");
 	}
 	
-	public LexiconBuilder(Index i) {
+	@SuppressWarnings("unchecked")
+	protected LexiconBuilder(Index i, String _structureName, 
+			Class <? extends LexiconMap> _LexiconMapClass,
+			String _lexiconEntryClass)
+	{
 		this.index = i;
 		this.indexPath = index.getPath();
 		this.indexPrefix = index.getPrefix();
+		this.defaultStructureName = _structureName;
 		TemporaryLexiconDirectory = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix + "_";
-		LexiconMapClass = LexiconMap.class;	
+		LexiconMapClass = _LexiconMapClass;	
+		lexiconEntryFactoryValueClass = _lexiconEntryClass;
 		try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
-		lexiconInputStream = LexiconInputStream.class;
-		lexiconOutputStream = LexiconOutputStream.class;
-	}
 	
-	/** 
-	 * Creates an instance of the class, given the path
-	 * to save the temporary lexicons.
-	 * @param pathname String the path to save the temporary lexicons.
-	 */
-	public LexiconBuilder(String pathname, String prefix) {
-		indexPath = pathname;
-		indexPrefix = prefix;
-		TemporaryLexiconDirectory = pathname + ApplicationSetup.FILE_SEPARATOR + prefix + "_";
-		LexiconMapClass = LexiconMap.class;	
-		try{ TempLex = (LexiconMap) LexiconMapClass.newInstance(); } catch (Exception e) {logger.error(e);}
-		lexiconInputStream = LexiconInputStream.class;
+		this.index.addIndexStructure(
+				defaultStructureName+"-keyfactory", 
+				"uk.ac.gla.terrier.structures.seralization.FixedSizeTextFactory",
+				"java.lang.String",
+				"${max.term.length}"
+				);
+		this.index.addIndexStructure(defaultStructureName+"-valuefactory", lexiconEntryFactoryValueClass+"$Factory", "", "");
+		valueFactory = (FixedSizeWriteableFactory<LexiconEntry>)this.index.getIndexStructure(defaultStructureName+"-valuefactory");
 		lexiconOutputStream = LexiconOutputStream.class;
 	}
 
@@ -155,9 +186,11 @@
 
 	/** If the application code generated lexicons itself, use this method to add them to the merge list 
 	  * Otherwise dont touch this method.
-	  * @param filename Fully path to a lexicon to merge */
-	public void addTemporaryLexicon(String filename) {
-		filename = ApplicationSetup.makeAbsolute(filename, TemporaryLexiconDirectory);
+	  * @param structureName Name of the temporary lexicon structure to merge
+	  * @deprecated */
+	public void addTemporaryLexicon(String structureName) {
+		tempLexFiles.addLast(structureName);
+		//filename = ApplicationSetup.makeAbsolute(filename, TemporaryLexiconDirectory);
 	}
 
 	/** Writes the current contents of TempLex temporary lexicon binary tree down to
@@ -172,14 +205,13 @@
 				Files.mkdir(tmpDir);
 				Files.deleteOnExit(tmpDir);//it's fine to mark the temporary *directory* for deletion
 			}
-			String tmpLexName = TemporaryLexiconDirectory + TempLexDirCount + ApplicationSetup.FILE_SEPARATOR + 
-				(TempLexCount) + ApplicationSetup.LEXICONSUFFIX;
-			LexiconOutputStream los = getLexOutputStream(tmpLexName);
+			//String tmpLexName = TemporaryLexiconDirectory + TempLexDirCount + ApplicationSetup.FILE_SEPARATOR + TempLexCount;
+			LexiconOutputStream<String> los = getLexOutputStream(TempLexDirCount+""+TempLexCount);
 			TempLex.storeToStream(los);
 			los.close();
 			/* An alternative but deprecated method to store the temporary lexicons is: 
 			 * TempLex.storeToFile(tmpLexName); */
-			tempLexFiles.addLast(tmpLexName);
+			tempLexFiles.addLast(TempLexDirCount+""+TempLexCount);
 		}catch(IOException ioe){
 			logger.error("Indexing failed to write a lexicon to disk : ", ioe);
 		}		
@@ -224,15 +256,7 @@
 	 * inverted index.
 	 */
 	public void finishedInvertedIndexBuild() {
-		if (Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"))) {
-			String lexiconFilename = indexPath + ApplicationSetup.FILE_SEPARATOR + indexPrefix + ApplicationSetup.LEXICONSUFFIX;
-			LexiconInputStream lexStream = getLexInputStream(lexiconFilename);
-			this.createLexiconHash(lexStream);
-		}
-		if (index != null)
-		{
-			index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structure.Lexicon");
-		}
+		LexiconBuilder.optimise(index, defaultStructureName);
 	}
 	
 	/** 
@@ -251,40 +275,23 @@
 		//merges the temporary lexicons
 		if (tempLexFiles.size() > 0)
 		{
-			Set<String> tempDirectories = new HashSet<String>();
-			for(String tmpLex : tempLexFiles)
-			{
-				tempDirectories.add(Files.getParent(tmpLex));
-			}
+			//Set<String> tempDirectories = new HashSet<String>();
+			//for(String tmpLex : tempLexFiles)
+			//{
+			//	tempDirectories.add(Files.getParent(tmpLex));
+			//}
 			try{
 				merge(tempLexFiles);
 				
-			
-				//creates the offsets file
-				final String lexiconFilename = 
-							indexPath + ApplicationSetup.FILE_SEPARATOR + 
-							indexPrefix + ApplicationSetup.LEXICONSUFFIX;
-				LexiconInputStream lis = getLexInputStream(lexiconFilename);
-				createLexiconIndex(
-						lis,
-						lis.numberOfEntries(),
-						Lexicon.lexiconEntryLength
-						); 
-				TermCount = lis.numberOfEntries();
-				if (index != null)
-				{
-					index.addIndexStructure("lexicon", "uk.ac.gla.terrier.structures.Lexicon");
-					index.addIndexStructureInputStream("lexicon", "uk.ac.gla.terrier.structures.LexiconInputStream");
-					index.setIndexProperty("num.Terms", ""+lis.numberOfEntries());
-					index.setIndexProperty("num.Pointers", ""+lis.getNumberOfPointersRead());
-				}
+				//creates the offsets and hash file
+				LexiconBuilder.optimise(index, defaultStructureName);
 			} catch(IOException ioe){
 				logger.error("Indexing failed to merge temporary lexicons to disk : ", ioe);
 			}
-			for (String tmpDir : tempDirectories)
-			{
-				Files.delete(tmpDir);
-			}	
+			//for (String tmpDir : tempDirectories)
+			//{
+			//	Files.delete(tmpDir);
+			//}
 		}	
 		else
 			logger.warn("No temporary lexicons to merge, skipping");
@@ -297,6 +304,7 @@
 	 * @throws IOException an input/output exception is throws 
 	 *		 if a problem is encountered.
 	 */
+	@SuppressWarnings("unchecked")
 	public void merge(LinkedList<String> filesToMerge) throws IOException {
 		//now the merging of the files in the filesToMerge vector 
 		//must take place. 
@@ -318,7 +326,8 @@
 		}
 		if (StartFileCount == 1)
 		{
-			Files.rename(filesToMerge.removeFirst(), indexPath + ApplicationSetup.FILE_SEPARATOR +indexPrefix + ApplicationSetup.LEXICONSUFFIX);
+			MapFileLexicon.renameMapFileLexicon(filesToMerge.removeFirst(), index.getPath(), index.getPrefix(), 
+					defaultStructureName, index.getPath(), index.getPrefix());
 		}
 		else if (MERGE2LEXATTIME)
 		{
@@ -334,23 +343,17 @@
 				
 				//give the proper name to the final merged lexicon
 				if (filesToMerge.size() == 0) 
-					newMergedFile = indexPath + ApplicationSetup.FILE_SEPARATOR + 
-						indexPrefix + ApplicationSetup.LEXICONSUFFIX;
+					newMergedFile = defaultStructureName;
 				else 
-					newMergedFile =
-						Files.getParent(fileToMerge1) 
-							+ ApplicationSetup.FILE_SEPARATOR
-							+ ApplicationSetup.MERGE_PREFIX
-							+ String.valueOf(progressiveNumber++)
-							+ ApplicationSetup.LEXICONSUFFIX;
+					newMergedFile = "tmp_"+defaultStructureName+ String.valueOf(progressiveNumber++);
 	
 				//The opening of the files needs to break into more steps, so that
 				//all the open streams are closed after the completion of the 
 				//operation, and eventually the intermediate files are deleted.
 
-				LexiconInputStream lis1 = getLexInputStream(fileToMerge1);
-				LexiconInputStream lis2 = getLexInputStream(fileToMerge2);
-				LexiconOutputStream los = getLexOutputStream(newMergedFile);
+				Iterator<Map.Entry<String,LexiconEntry>> lis1 = getLexInputStream(fileToMerge1);
+				Iterator<Map.Entry<String,LexiconEntry>> lis2 = getLexInputStream(fileToMerge2);
+				LexiconOutputStream<String> los = getLexOutputStream(newMergedFile);
 	
 				if (logger.isDebugEnabled())
 					logger.debug(
@@ -386,7 +389,7 @@
 				if (logger.isDebugEnabled())
 					 logger.debug("merging "+ numLexicons + " temporary lexicons");
 				final String inputLexiconFileNames[] = new String[numLexicons];
-				final LexiconInputStream[] lis = new LexiconInputStream[numLexicons];
+				final Iterator<Map.Entry<String,LexiconEntry>>[] lis = (Iterator<Map.Entry<String,LexiconEntry>>[])new Iterator[numLexicons];
 
 				for(int i=0;i<numLexicons;i++)
 				{
@@ -397,21 +400,15 @@
 				String newMergedFile = null;
 				//give the proper name to the final merged lexicon
 				if (filesToMerge.size() == 0)
-					newMergedFile = indexPath + ApplicationSetup.FILE_SEPARATOR +
-						indexPrefix + ApplicationSetup.LEXICONSUFFIX;
+					newMergedFile = defaultStructureName;
 				else
-					newMergedFile =
-						Files.getParent(inputLexiconFileNames[0])
-							+ ApplicationSetup.FILE_SEPARATOR
-							+ ApplicationSetup.MERGE_PREFIX
-							+ String.valueOf(progressiveNumber++)
-							+ ApplicationSetup.LEXICONSUFFIX;
+					newMergedFile = String.valueOf(progressiveNumber++);
 
-				final LexiconOutputStream  los = getLexOutputStream(newMergedFile);
+				final LexiconOutputStream<String> los = getLexOutputStream(newMergedFile);
 				mergeNLexicons(lis, los);
-				for(int i=0;i<numLexicons;i++)
+				for(String inputLexiconFileName : inputLexiconFileNames)
 				{
-					Files.delete(inputLexiconFileNames[i]);
+					MapFileLexicon.deleteMapFileLexicon(inputLexiconFileName, index.getPath(), index.getPrefix());
 				}
 				filesToMerge.addLast(newMergedFile);
 			}
@@ -424,7 +421,8 @@
 				logger.debug("begin merging "+ StartFileCount +" temporary lexicons at once...");
 			long startTime = System.currentTimeMillis();
 			final String inputLexiconFileNames[] = new String[StartFileCount];
-			final LexiconInputStream[] lis = new LexiconInputStream[StartFileCount];
+			final Iterator<Map.Entry<String,LexiconEntry>>[] lis = 
+				(Iterator<Map.Entry<String,LexiconEntry>>[]) new Iterator[StartFileCount];
 			
 			for(int i=0;i<StartFileCount;i++)
 			{
@@ -432,8 +430,7 @@
 				lis[i] = getLexInputStream(inputLexiconFileNames[i]);
 				//logger.debug(i+" "+inputLexiconFileNames[i]);
 			}
-			final LexiconOutputStream los = getLexOutputStream( indexPath + ApplicationSetup.FILE_SEPARATOR +
-				indexPrefix + ApplicationSetup.LEXICONSUFFIX);
+			final LexiconOutputStream<String> los = getLexOutputStream(defaultStructureName);
 			mergeNLexicons(lis, los);
 			for(int i=0;i<StartFileCount;i++)
 			{
@@ -443,23 +440,42 @@
 			if (logger.isDebugEnabled())
 				logger.debug("end of merging...("+((endTime-startTime)/1000.0D)+" seconds)");
 		}
+		MapFileLexiconOutputStream.addLexiconToIndex(this.index, defaultStructureName, lexiconEntryFactoryValueClass+"$Factory");
 	}
 	
-	protected void mergeNLexicons(LexiconInputStream[] lis, LexiconOutputStream los) throws IOException
+	protected LexiconEntry newLexiconEntry(int termid)
+	{
+		LexiconEntry rtr = valueFactory.newInstance();
+		rtr.setTermId(termid);
+		return rtr;
+	}
+	
+	@SuppressWarnings("unchecked")
+	protected void mergeNLexicons(Iterator<Map.Entry<String,LexiconEntry>>[] lis, LexiconOutputStream<String> los) throws IOException
 	{
 		final int numLexicons = lis.length;
-		long totalTokens = 0;
-		long totalPointers = 0;
-		int hasMore[] = new int[numLexicons];
-		Arrays.fill(hasMore, -1);
+		boolean hasMore[] = new boolean[numLexicons];
+		Map.Entry<String,LexiconEntry>[] currentEntries = new Map.Entry[numLexicons];
+		
+		Arrays.fill(hasMore, false);
 		PriorityQueue<String> terms = new PriorityQueue<String>(numLexicons);
 		for(int i=0;i<numLexicons;i++)
 		{
-			hasMore[i] = lis[i].readNextEntry();
-			terms.add(lis[i].getTerm());	
+			hasMore[i] = lis[i].hasNext();
+			if (hasMore[i])
+			{
+				currentEntries[i] = lis[i].next();
+				terms.add(currentEntries[i].getKey());
+			}
+			else
+			{
+				currentEntries[i] = null;
+			}
+				
 		}
-		int Tf = 0; int Nt = 0; String targetTerm= null;
+		String targetTerm= null;
 		int targetTermId  = -1;
+		LexiconEntry nextEntryToWrite = null;
 		while(terms.size() > 0)
 		{
 			//what term are we working on
@@ -470,24 +486,28 @@
 			{
 				//does this lexicon contain the term
 				//logger.debug("Checking lexicon "+i+" for "+targetTerm+"="+lis[i].getTerm());
-				if(hasMore[i] != -1 && lis[i].getTerm().equals(targetTerm))
+				if(hasMore[i] && currentEntries[i].getKey().equals(targetTerm))
 				{
 					if (targetTermId == -1)
 					{	//obtain the termid for this term from the first lexicon that has the term
-						targetTermId = lis[i].getTermId();
+						nextEntryToWrite = newLexiconEntry(targetTermId = currentEntries[i].getValue().getTermId());
 					}
-					else if (targetTermId != lis[i].getTermId())
+					else if (targetTermId != currentEntries[i].getValue().getTermId())
 					{	//check the termids match for this term
-						logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+lis[i].getTermId()+")");
+						logger.error("Term "+targetTerm+" had two termids ("+targetTermId+","+currentEntries[i].getValue().getTermId()+")");
 					}
 					//logger.debug("Term "+targetTerm + " found in "+i + "termid="+ lis[i].getTermId());
-					Tf += lis[i].getTF();
-					Nt += lis[i].getNt();
-					hasMore[i] = lis[i].readNextEntry();
-					if (hasMore[i] != -1)
+					nextEntryToWrite.add(currentEntries[i].getValue());
+					hasMore[i] = lis[i].hasNext();
+					
+					if (hasMore[i])
+					{
+						currentEntries[i] = lis[i].next();
+						terms.add(currentEntries[i].getKey());
+					}
+					else
 					{
-						terms.add(lis[i].getTerm());
-						//break;
+						currentEntries[i] = null;
 					}
 					break;
 				}
@@ -499,19 +519,18 @@
 					logger.error("Term "+ targetTerm + " not found in any lexicons");
 				}
 				//end of this term, so we can write the lexicon entry
-				totalTokens += Tf;
-				totalPointers += Nt;
-				los.writeNextEntry(targetTerm, targetTermId, Nt, Tf, 0, (byte)0);
-				Tf = Nt = 0; targetTermId = -1; targetTerm = null;
+				los.writeNextEntry(targetTerm, nextEntryToWrite);
+				nextEntryToWrite = null; targetTermId = -1; targetTerm = null;
 			}
 		}
-		totalTokens += Tf;
-		totalPointers += Nt;
 		if (targetTermId != -1)
-			los.writeNextEntry(targetTerm, targetTermId, Nt, Tf, 0, (byte)0);
+			los.writeNextEntry(targetTerm, nextEntryToWrite);
 		los.close();
 		for(int i=0;i<numLexicons;i++)
-			lis[i].close();
+		{
+			if (lis[i] instanceof Closeable)
+				((Closeable)lis[i]).close();
+		}
 	}
 		
 
@@ -521,9 +540,9 @@
 	  * @param los Lexion to be merged to
 	  */
 	protected void mergeTwoLexicons(
-			LexiconInputStream lis1,
-			LexiconInputStream lis2,
-			LexiconOutputStream los) throws IOException
+			Iterator<Map.Entry<String,LexiconEntry>> lis1,
+			Iterator<Map.Entry<String,LexiconEntry>> lis2,
+			LexiconOutputStream<String> los) throws IOException
 	{
 
 		//We always take the first two entries of
@@ -538,21 +557,22 @@
 		int termID1 = 0;
 		int termID2 = 0;
 
-		long totalTokens = 0;
-		long totalPointers = 0;
-	
 
-		hasMore1 = (lis1.readNextEntry()!=-1);
-		hasMore2 = (lis2.readNextEntry()!=-1);
+		hasMore1 = lis1.hasNext();
+		hasMore2 = lis2.hasNext();
 		String sTerm1 = null;
 		String sTerm2 = null;
+		Map.Entry<String, LexiconEntry> lee1 = null;
+		Map.Entry<String, LexiconEntry> lee2 = null;
 		if (hasMore1) {
-			termID1 = lis1.getTermId();
-			sTerm1 = lis1.getTerm();
+			lee1 = lis1.next();
+			termID1 = lee1.getValue().getTermId();
+			sTerm1 = lee1.getKey();
 		}
 		if (hasMore2) {
-			termID2 = lis2.getTermId();
-			sTerm2 = lis2.getTerm();
+			lee2 = lis2.next();
+			termID2 = lee2.getValue().getTermId();
+			sTerm2 = lee2.getKey();
 		}
 		while (hasMore1 && hasMore2) {
 			int compareString = 0;
@@ -567,341 +587,137 @@
 			}
 			
 			if (compareString <0) {
-				totalTokens += lis1.getTF();
-				totalPointers += lis1.getNt();
-				los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset());
-				hasMore1 = (lis1.readNextEntry()!=-1);
+				los.writeNextEntry(sTerm1, lee1.getValue());
+				hasMore1 = lis1.hasNext();
 				if (hasMore1) {
-					termID1 = lis1.getTermId();
-					sTerm1 = lis1.getTerm();
+					lee1 = lis1.next();
+					termID1 = lee1.getValue().getTermId();
+					sTerm1 = lee1.getKey();
 				}
 			} else if (compareString >0) {
-				totalTokens += lis2.getTF();
-				totalPointers += lis2.getNt();
-				los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset());
-				hasMore2 = (lis2.readNextEntry()!=-1);
+				los.writeNextEntry(sTerm2, lee2.getValue());
+				hasMore2 = lis2.hasNext();
 				if (hasMore2) {
-					termID2 = lis2.getTermId();
-					sTerm2 = lis2.getTerm();
+					lee2 = lis2.next();
+					termID2 = lee2.getValue().getTermId();
+					sTerm2 = lee2.getKey();
 				}
 			} else /*if (compareString == 0)*/ {
-				totalTokens += lis1.getTF() + lis2.getTF();
-				totalPointers += lis1.getNt() + lis2.getNt();
+				lee1.getValue().add(lee2.getValue());
 				los.writeNextEntry(
 					sTerm1, 
-					termID1, 
-					lis1.getNt() + lis2.getNt(),
-					lis1.getTF() + lis2.getTF(),  							 
-					0, //inverted index not built yet, so no offsets
-					(byte)0 //inverted index not built yet, so no offsets
+					lee1.getValue()
 				);
-		
-				hasMore1 = (lis1.readNextEntry()!=-1);
-				hasMore2 = (lis2.readNextEntry()!=-1);
+				hasMore1 = lis1.hasNext();
+				hasMore2 = lis2.hasNext();
 				if (hasMore1) {
-					termID1 = lis1.getTermId();
-					sTerm1 = lis1.getTerm();
+					lee1 = lis1.next();
+					termID1 = lee1.getValue().getTermId();
+					sTerm1 = lee1.getKey();
 				}
 				if (hasMore2) {
-					termID2 = lis2.getTermId();
-					sTerm2 = lis2.getTerm();
+					lee2 = lis2.next();
+					termID2 = lee2.getValue().getTermId();
+					sTerm2 = lee2.getKey();
 				}
 			}
 		}
 		if (hasMore1) {
-			lis2.close();
+			if (lis2 instanceof Closeable) {
+				((Closeable)lis2).close();
+			}
 
 			while (hasMore1) {
-				totalTokens += lis1.getTF();
-				totalPointers += lis1.getNt();
-				los.writeNextEntry(sTerm1, termID1, lis1.getNt(), lis1.getTF(), lis1.getEndOffset(), lis1.getEndBitOffset());
-				hasMore1 = (lis1.readNextEntry()!=-1);
+				los.writeNextEntry(sTerm1, lee1.getValue());
+				hasMore1 = lis1.hasNext();
 				if (hasMore1) {
-					termID1 = lis1.getTermId();
-					sTerm1 = lis1.getTerm();
+					lee1 = lis1.next();
+					termID1 = lee1.getValue().getTermId();
+					sTerm1 = lee1.getKey();
 				}
 			}
 
 			//close input file 1 stream
-			lis1.close();
+			if (lis2 instanceof Closeable) {
+				((Closeable)lis2).close();
+			}
 
 		} else if (hasMore2) {
-			lis1.close();
+			if (lis1 instanceof Closeable) {
+				((Closeable)lis1).close();
+			}
 
 			while (hasMore2) {
-				totalTokens += lis2.getTF();
-				totalPointers += lis2.getNt();	
-				los.writeNextEntry(sTerm2, termID2, lis2.getNt(), lis2.getTF(), lis2.getEndOffset(), lis2.getEndBitOffset());
-				hasMore2 = (lis2.readNextEntry()!=-1);
+				los.writeNextEntry(sTerm2, lee2.getValue());
+				hasMore2 = lis2.hasNext();
 				if (hasMore2) {
-					termID2 = lis2.getTermId();
-					sTerm2 = lis2.getTerm();
+					lee2 = lis2.next();
+					termID2 = lee2.getValue().getTermId();
+					sTerm2 = lee2.getKey();
 				}
 			}
 			//close input file 2 stream
-			lis2.close();
+			if (lis2 instanceof Closeable) {
+				((Closeable)lis2).close();
+			}
 		}
 		//close output file streams
 		los.close();
 	}
 	
-	/**
-	 * Creates the lexicon index file that contains a mapping from the 
-	 * given term id to the offset in the lexicon, in order to 
-	 * be able to retrieve the term information according to the 
-	 * term identifier. This is necessary, because the terms in the lexicon 
-	 * file are saved in lexicographical order, and we also want to have 
-	 * fast access based on their term identifier.
-	 * @param lexicon The input stream of the lexicon that we are creating the lexid file for
-	 * @param lexiconEntries The number of entries in this lexicon
-	 * @param lexiconEntrySize The size of one entry in this lexicon
-	 * @exception java.io.IOException Throws an Input/Output exception if 
-	 *			there is an input/output error. 
-	 */
-	public void createLexiconIndex(final LexiconInputStream lexicon, 
-			final int lexiconEntries, 
-			final int lexiconEntrySize) throws IOException {
-		createLexiconIndex(lexicon, lexiconEntries,lexiconEntrySize, indexPath, indexPrefix);
-	}
-	/**
-	 * Creates the lexicon index file that contains a mapping from the
-	 * given term id to the offset in the lexicon, in order to
-	 * be able to retrieve the term information according to the
-	 * term identifier. This is necessary, because the terms in the lexicon
-	 * file are saved in lexicographical order, and we also want to have
-	 * fast access based on their term identifier.
-	 * @param lexicon The input stream of the lexicon that we are creating the lexid file for
-	 * @param lexiconEntries The number of entries in this lexicon
-	 * @param lexiconEntrySize The size of one entry in this lexicon
-	 * @param path The path to the index containing the lexicon
-	 * @param prefix The prefix of the index containing the lexicon
-	 * @exception java.io.IOException Throws an Input/Output exception if
-	 * 	there is an input/output error.
-	 */
-
-	public static void createLexiconIndex(final LexiconInputStream lexicon,
-			final int lexiconEntries, final int lexiconEntrySize,
-			final String path, final String prefix) throws IOException
-	{
-		//save the offsets to a file with the same name as
-		//the lexicon and extension .lexid
-		String lexid = path +
-					ApplicationSetup.FILE_SEPARATOR +
-					prefix +
-					ApplicationSetup.LEXICON_INDEX_SUFFIX;
-		DataOutputStream dosLexid = new DataOutputStream(Files.writeFileStream(lexid));
-		createLexiconIndex(lexicon, lexiconEntries, lexiconEntrySize, dosLexid);
-	}
-	
-	public static void createLexiconIndex(final LexiconInputStream lexicon,
-				final int lexiconEntries, final int lexiconEntrySize,
-				final DataOutputStream dosLexid) throws IOException
-		{
-
-		/*
-		 * This method reads from the lexicon the term ids and stores the
-		 * corresponding offsets in an array. Then this array is written out 
-		 * in order according to the term id.
-		 */
-		long totalPointers = 0;
-		long totalTokens = 0;
-
-
-		//the i-th element of offsets contains the offset in the
-		//lexicon file of the term with term identifier equal to i.
-		long[] offsets = new long[lexiconEntries];
-		int termid = -1;
-		int i=0;
-		try{
-			while (lexicon.readNextEntry()!=-1) {
-		 		termid = lexicon.getTermId();
-				totalPointers += lexicon.getNt();
-				totalTokens += lexicon.getTF();
-				//Debugging: if an exception occurs here, then this infers that the number of entries in the lexicon
-				//has been calculated incorrectly, or that termId > lexiconEntries. termid > lexiconEntries could be
-				//a sign that the lexicon is being decoded incorrecty - eg you're using LexiconInputStream instead of
-				//UTFLexiconInputStream
-				offsets[termid] = (long)i * (long)lexiconEntrySize;
-				i++;
-			}
-		} catch (ArrayIndexOutOfBoundsException aioob) {
-			logger.error("Termid overflow while creating lexid file: NumEntries="+lexiconEntries+ " entrySize="
-				+lexiconEntrySize+ " termid="+termid, aioob);
-		}
-		lexicon.close();
-		//write out the offsets
-		for (i = 0; i < lexiconEntries; i++) {
-			dosLexid.writeLong(offsets[i]);
-		}
-		dosLexid.close();
-	}
 	
 	/** Creates a lexicon index for the specified index
 	  * @param index Index to make the lexicon index for
+	  * @deprecated use optimise instead
 	  */	
 	public static void createLexiconIndex(Index index) throws IOException
 	{
-		final LexiconInputStream lis = (LexiconInputStream)index.getIndexStructureInputStream("lexicon");
-		LexiconBuilder.createLexiconIndex(
-			lis,
-			index.getCollectionStatistics().getNumberOfUniqueTerms(), 
-			lis.getEntrySize(), 
-			index.getPath(),
-			index.getPrefix());
+		optimise(index, "lexicon");
 	}
 
-	/** Create a lexicon hash for the current index
-	  * @param lexStream lexiconinputstream to process
-	  */
-	public void createLexiconHash(final LexiconInputStream lexStream) {
-		LexiconBuilder.createLexiconHash(lexStream, indexPath, indexPrefix);
-	}
 
 	
 	/** Creates a lexicon hash for the specified index
-	 * @param index Index to make the LexiconHash for
+	 * @param index Index whose lexicon the LexiconHash is made for
+	 * @deprecated use optimise instead
 	 */
 	public static void createLexiconHash(final Index index) throws IOException
 	{
-		LexiconBuilder.createLexiconHash((LexiconInputStream)index.getIndexStructureInputStream("lexicon"),
-			index.getPath(),index.getPrefix());
-	}
-	
-	/**
-	 * Creates a Lexicon hash. This method reads the lexicon and finds the entries which 
-	 * start with a different letter. The offset of these entries
-	 * is used to speed up the binary search performed during retrieval.
-	 * These offsets are saved to a lex hash file beside the Lexicon in the Index.
-	 * @param lexStream LexiconInputStream to process
-	 * @param path Path to the index containing the lexicon
-	 * @param prefix Prefix of the index containing the lexicon
-	 */
-	public static void createLexiconHash(final LexiconInputStream lexStream, final String path, final String prefix) {
-		String filename = path + ApplicationSetup.FILE_SEPARATOR + prefix + ApplicationSetup.LEXICON_HASH_SUFFIX;
-		try{
-			createLexiconHash(lexStream, Files.writeFileStream(filename));
-		} catch(IOException ioe) {
-			logger.error("IOException while creating hash file in LexiconBuilder.createLexiconHash: " + ioe);
-		}
+		optimise(index, "lexicon");
 	}
 	
-	public static void createLexiconHash(final LexiconInputStream lexStream, OutputStream out)
-	{
-		TIntObjectHashMap map = new TIntObjectHashMap();
-		int previousFirstChar = -1;
-		int firstChar = 0;
-		int counter = -1;
-
-		try {
-			//read all the terms in the lexicon and 
-			//mark the offset of the ones that start
-			//with a different character from the 
-			//previous entry.
-			while (lexStream.readNextEntry()!=-1) {
-				firstChar = lexStream.getTerm().charAt(0);
-				if (firstChar!=previousFirstChar) {
-					int[] boundaries = new int[] {counter, 0};
-					map.put(firstChar, boundaries);
-					previousFirstChar = firstChar;
-				}
-				counter++;
-			}
-			lexStream.close();
-
-	
-			//NB: map should not be too large, say 26+10, more if UTF characters			
-			
-			// after reading all the entries, update the upper 
-			// boundary, which is zero from the previous step.
-			int[] mapKeys = map.keys();
-			Arrays.sort(mapKeys);
-			final int mapKeysSize = mapKeys.length;
-			for (int i=0; i<mapKeysSize-1; i++) {
-				int nextLowerBoundary = ((int[])map.get(mapKeys[i+1]))[0];
-				int[] currentBoundaries = (int[])map.get(mapKeys[i]);
-				currentBoundaries[1] = nextLowerBoundary;
-				map.put(mapKeys[i], currentBoundaries);
-			}
-			//do something about the last entry
-			int nextLowerBoundary = counter;
-			int[] currentBoundaries = (int[])map.get(mapKeys[mapKeysSize-1]);
-			currentBoundaries[1] = nextLowerBoundary;
-			map.put(mapKeys[mapKeysSize-1], currentBoundaries);
-			ObjectOutputStream oos = new ObjectOutputStream(out);
-			oos.writeObject(map);
-			oos.close();
-			//logger.debug("Wrote lexicon hash to "+ filename);	
-		} catch(IOException ioe) {
-			logger.error("IOException while reading the lexicon in LexiconBuilder.createLexiconHash: " + ioe);
-		}
-	}
 
-	public static void main(String args[])
+	/** Optimises the lexicon, eg lexid file */
+	public static void optimise(final Index index, final String structureName)
 	{
-		boolean USE_UTF = Boolean.parseBoolean(ApplicationSetup.getProperty("string.use_utf", "false"));
-		
 		try{
-			if ((args.length == 3||args.length ==4  )&& args[0].equals("--createlexiconindex"))
-			{
-				if (USE_UTF)
-					createLexiconIndex(
-							new UTFLexiconInputStream(args[1], args[2]),
-							args.length == 4
-								? Integer.parseInt(args[3])
-								: Lexicon.numberOfEntries(args[1] + ApplicationSetup.FILE_SEPARATOR + args[2] + ApplicationSetup.LEXICONSUFFIX),
-							Lexicon.lexiconEntryLength,
-							args[1], args[2]);
-				else	
-					createLexiconIndex(
-							new LexiconInputStream(args[1], args[2]),
-							args.length == 4
-							? Integer.parseInt(args[3])
-									: Lexicon.numberOfEntries(args[1] + ApplicationSetup.FILE_SEPARATOR + args[2] + ApplicationSetup.LEXICONSUFFIX),
-									Lexicon.lexiconEntryLength,
-									args[1], args[2]);
-			}
-			else if (args.length == 3 && args[0].equals("--createlexiconhash"))
-			{
-				if (USE_UTF)
-					createLexiconHash( new UTFLexiconInputStream(args[1], args[2]), args[1], args[2]);
-				else
-					createLexiconHash( new LexiconInputStream(args[1], args[2]), args[1], args[2]);
-			}
-			else
-			{
-				logger.fatal("Usage: uk.ac.gla.terrier.indexing.structures.LexiconBuilder {--createlexiconindex|--createlexiconhash} /path/to/index fileprefix [numEntries]");
-				logger.fatal("Exiting ...");
-				System.exit(0);
-			}
+			logger.info("Optimising structure "+structureName);
+			CollectionStaticticsCounter<LexiconEntry> counter = new BasicLexiconCollectionStaticticsCounter(index);
+			MapFileLexicon.optimise(structureName, index, counter);
+			counter.close();
 		} catch (IOException ioe) {
-			logger.error("IOException while building lexicon index : ",ioe);
-			
+			logger.error("IOException while creating optimising lexicon called " + structureName, ioe);
 		}
 	}
 
 
 	/** return the lexicon input stream for the current index at the specified filename */	
-	protected LexiconInputStream getLexInputStream(String filename)
+	@SuppressWarnings("unchecked")
+	protected Iterator<Map.Entry<String,LexiconEntry>> getLexInputStream(String structureName) throws IOException
 	{
-		LexiconInputStream li = null;
-		try{
-			li = (LexiconInputStream) lexiconInputStream.getConstructor(String.class).newInstance(filename);
-		} catch (Exception e) {
-			logger.error("Problem loading a LexiconInputStream", e);
-		}
-		return li;
+		return new MapFileLexicon.MapFileLexiconIterator(structureName, 
+				(FixedSizeWriteableFactory<Text>)index.getIndexStructure(defaultStructureName+"-keyfactory"), 
+				(FixedSizeWriteableFactory<LexiconEntry>)index.getIndexStructure(defaultStructureName+"-valuefactory"));
 	}
 
 	/** return the lexicon outputstream or the current index at the specified filename */
-	protected LexiconOutputStream getLexOutputStream(String filename)
+	@SuppressWarnings("unchecked")
+	protected LexiconOutputStream<String> getLexOutputStream(String structureName) throws IOException
 	{
-		LexiconOutputStream lo = null;
-		try{
-			lo = (LexiconOutputStream) lexiconOutputStream.getConstructor(String.class).newInstance(filename);
-		} catch (Exception e) {
-			logger.error("Problem loading a LexiconOutputStream", e);
-		}
-		return lo;
+		return new MapFileLexiconOutputStream(
+				index.getPath(), index.getPrefix(), 
+				structureName, 
+				(FixedSizeWriteableFactory<Text>)index.getIndexStructure(defaultStructureName+"-keyfactory"));
 	}
 
 }
Index: src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java,v
retrieving revision 1.6
diff -w -u -r1.6 LexiconMap.java
--- src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java	28 Jan 2009 20:16:58 -0000	1.6
+++ src/uk/ac/gla/terrier/structures/indexing/LexiconMap.java	26 Feb 2009 16:11:47 -0000
@@ -32,6 +32,7 @@
 import java.io.IOException;
 import java.util.Arrays;
 
+import uk.ac.gla.terrier.structures.BasicLexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.TermCodes;
@@ -96,14 +97,16 @@
 	  * The binary tree is traversed in order, by called the method
 	  * traverseAndStoreToStream.
 	  * @param lexiconStream The lexicon output stream to store to. */
-	public void storeToStream(LexiconOutputStream lexiconStream) throws IOException {
-		final byte zerob = (byte)0;
-		final long zerol = (long)0;
+	public void storeToStream(LexiconOutputStream<String> lexiconStream) throws IOException
+	{
 		final String[] terms = tfs.keys(new String[0]);
 		Arrays.sort(terms);
+		BasicLexiconEntry le = new BasicLexiconEntry();//TODO could use the one without positions
 		for (String t : terms)
 		{
-			lexiconStream.writeNextEntry(t, TermCodes.getCode(t), nts.get(t), tfs.get(t), zerol, zerob);
+			le.setTermId(TermCodes.getCode(t));
+			le.setStatistics(nts.get(t), tfs.get(t));
+			lexiconStream.writeNextEntry(t, le);
 		}
 	}
 	
Index: src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java,v
retrieving revision 1.18
diff -w -u -r1.18 TermEstimateIndex.java
--- src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java	28 Jan 2009 20:16:58 -0000	1.18
+++ src/uk/ac/gla/terrier/structures/indexing/TermEstimateIndex.java	26 Feb 2009 16:11:47 -0000
@@ -25,15 +25,17 @@
  */
 package uk.ac.gla.terrier.structures.indexing;
 import java.io.DataInputStream;
-import java.io.File;
 import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
 
 import org.apache.log4j.Logger;
 
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.CollectionStatistics;
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.Lexicon;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 import uk.ac.gla.terrier.utility.Files;
 /**
@@ -42,20 +44,16 @@
  */
 public class TermEstimateIndex {
 	private static Logger logger = Logger.getRootLogger();
-	protected final Lexicon lex;
+	protected final Lexicon<String> lex;
 	protected final int numTerms;
 
 	/** The array of term estimate for each term. It is sorted by termid. */
 	protected double[] termEstimate;
 	/** The filename of the term estimate index on disk. */
 	protected String INDEX_FILENAME; 
-	/**
-	 * The default constructor.
-	 */
-	public TermEstimateIndex() {
-		this( Index.createIndex() );
-	}
-	public TermEstimateIndex(Index index)
+
+	@SuppressWarnings("unchecked")
+	public TermEstimateIndex(Index index) throws IOException
 	{
 		final String path  = index.getPath();
 		final String prefix = index.getPrefix();
@@ -71,35 +69,30 @@
 
 		//always use a lexiconinputstream, as blocklexicons dont exist past invertedindex creation
 		//but check if we're using UTF
-		final LexiconInputStream lexin = (LexiconInputStream)index.getIndexStructureInputStream("lexicon");
+		final Iterator<Map.Entry<String,LexiconEntry>> lexin = 
+			(Iterator<Map.Entry<String,LexiconEntry>>)index.getIndexStructureInputStream("lexicon");
 
-
-		for (int i = 0; i < termids.length; i++){
-			try{
-				lexin.readNextEntry();
-				termids[i] = lexin.getTermId();
+		int i=0;
+		while(lexin.hasNext())
+		{
+			termids[i++] = lexin.next().getValue().getTermId();
 			}
-			catch(IOException ioe){
-				logger.error("Problem reading lexicon input stream while loading TermEstimateIndex");
+		if (lexin instanceof Closeable)
+			((Closeable)lexin).close();
 				
+		if (! Files.exists(INDEX_FILENAME)){ //no estimates file on disk, so there is nothing to read below
+			logger.error("Could not load TermEstimate index");
+			return;
 			}
-		}
-		lexin.close();
 		
-		if (Files.exists(INDEX_FILENAME)){
-			try{
 				DataInputStream in = new DataInputStream(
 					Files.openFileStream(INDEX_FILENAME));
-				for (int i = 0; i < collectionStatistics.getNumberOfUniqueTerms(); i++){
+		final int termCount = collectionStatistics.getNumberOfUniqueTerms();
+		for (i = 0; i < termCount; i++){
 					this.termEstimate[termids[i]] = in.readDouble();
 				}
 				in.close();
 			}
-			catch(IOException ioe){
-				logger.error("Problem reading TermEstimateIndex at "+INDEX_FILENAME, ioe);
-			}
-		}
-	}
 	/**
 	 * This method prints all the entries in the term estimate index.
 	 *
@@ -110,9 +103,9 @@
 					Files.openFileStream(INDEX_FILENAME));
 			for (int i = 0; i < numTerms; i++){
 				double te = in.readDouble();
-				lex.seekEntry(i);
+				Map.Entry<String,LexiconEntry> lee = lex.getIthLexiconEntry(i);
 				if(logger.isDebugEnabled()){
-					logger.debug(lex.getTerm() + ": " + te);
+					logger.debug(lee.getKey() + ": " + te);
 				}
 			}
 			in.close();
Index: src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java,v
retrieving revision 1.6
diff -w -u -r1.6 RunsMerger.java
--- src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java	28 Jan 2009 20:16:59 -0000	1.6
+++ src/uk/ac/gla/terrier/structures/indexing/singlepass/RunsMerger.java	26 Feb 2009 16:11:47 -0000
@@ -35,6 +35,9 @@
 
 import uk.ac.gla.terrier.compression.BitOut;
 import uk.ac.gla.terrier.compression.BitOutputStream;
+import uk.ac.gla.terrier.structures.BasicLexiconEntry;
+import uk.ac.gla.terrier.structures.BitFilePosition;
+import uk.ac.gla.terrier.structures.FilePosition;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
 
 /**
@@ -87,6 +90,8 @@
 	/** Number of pointers written */
 	protected int numberOfPointers = 0;
 
+	protected BitFilePosition startOffset = new FilePosition(0l,(byte)0);
+
 	
 	protected RunIteratorFactory runsSource;
 	
@@ -134,14 +139,16 @@
 	 * @return the byte offset in the BitOut (used for lexicon writting)
 	 */
 	public long getByteOffset(){
-		return bos.getBitOffset() == 0? bos.getByteOffset() - 1: bos.getByteOffset(); 
+		return bos.getByteOffset();
+		//return bos.getBitOffset() == 0? bos.getByteOffset() - 1: bos.getByteOffset(); 
 	}
 	
 	/**
 	 * @return the bit offset in the BitOut (used for lexicon writting)
 	 */
-	public int getBitOffset(){
-		return bos.getBitOffset() == 0? 7: bos.getBitOffset() - 1;
+	public byte getBitOffset(){
+		return bos.getBitOffset();
+		//return bos.getBitOffset() == 0 ? (byte)7 : bos.getBitOffset() - (byte)1;
 	}
 	
 	/**
@@ -207,21 +214,26 @@
 	 * @param lexStream LexiconOutputStream used to write the lexicon.
 	 * @throws IOException if an I/O error occurs.
 	 */
-	public void mergeOne(LexiconOutputStream lexStream) throws Exception{		
+	public void mergeOne(LexiconOutputStream<String> lexStream) throws Exception{		
 		myRun = queue.poll();
 		if(myRun.current().getTerm().equals(lastTermWritten)){
 			// append the term --> keep the data in memory
 			lastDocument = myRun.current().append(bos, lastDocument);
 			lastFreq += myRun.current().getTF();
 			lastDocFreq += myRun.current().getDf();
+			
 		}else{			
-			lexStream.writeNextEntry(lastTermWritten, currentTerm++, lastDocFreq, lastFreq, this.getByteOffset(), (byte)this.getBitOffset());
-			// write the new term
+			//write this term to the lexicon
+			lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset));
+			//record the start offset of the next term
+			startOffset.setPosition(this.getByteOffset(), this.getBitOffset());
+			//get the information of the next term from the Run
 			numberOfPointers += lastDocFreq;
 			lastDocument = myRun.current().append(bos,-1);
 			lastFreq = myRun.current().getTF();
 			lastDocFreq = myRun.current().getDf();
 			lastTermWritten = myRun.current().getTerm();
+			
 		}
 		if(myRun.hasNext()){
 			myRun.next();
@@ -236,8 +248,9 @@
 	 * @param lexStream LexiconOutputStream used to write the lexicon.
 	 * @throws IOException if an I/O error occurs.	
 	 */	
-	public void endMerge(LexiconOutputStream lexStream) throws IOException{
-		lexStream.writeNextEntry(lastTermWritten, currentTerm++, lastDocFreq, lastFreq, this.getByteOffset(), (byte)this.getBitOffset());		
+	public void endMerge(LexiconOutputStream<String> lexStream) throws IOException{
+		lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset));
+		//startOffset.setPosition(this.getByteOffset(), this.getBitOffset());
 		numberOfPointers += lastDocFreq;
 		bos.close();
 		myRun.close();
Index: src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java,v
retrieving revision 1.4
diff -w -u -r1.4 HadoopRunsMerger.java
--- src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java	16 Feb 2009 21:43:03 -0000	1.4
+++ src/uk/ac/gla/terrier/structures/indexing/singlepass/hadoop/HadoopRunsMerger.java	26 Feb 2009 16:11:47 -0000
@@ -30,6 +30,7 @@
 import java.io.IOException;
 import java.util.LinkedList;
 import java.util.ListIterator;
+import uk.ac.gla.terrier.structures.BasicLexiconEntry;
 import org.apache.hadoop.mapred.TaskID;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
 import uk.ac.gla.terrier.structures.indexing.singlepass.PostingInRun;
@@ -62,9 +63,9 @@
 		mapData = _mapData;
 	}
 
-	public void endMerge(LexiconOutputStream lexStream) {}
+	public void endMerge(LexiconOutputStream<String> lexStream) {}
 	
-	public void mergeOne(LexiconOutputStream lexStream) throws Exception
+	public void mergeOne(LexiconOutputStream<String> lexStream) throws Exception
 	{	
 		int maxDF = 0;
 		RunIterator run = runsSource.createRunIterator(-1);
@@ -74,6 +75,8 @@
 		lastTermWritten = null;
 		lastFreq = 0;
 		lastDocFreq= 0;
+		long startOffset = this.getByteOffset();
+		byte startBitOffset = this.getBitOffset();
 		// for each run in the list 
 		int counter = 0;
 		//for one term: for each set of postings for that term
@@ -83,7 +86,6 @@
 			PostingInRun posting = run.next();
 			lastTermWritten = posting.getTerm();
 			final int reduceNumber = (TaskID.forName(_run.getMapNo()).getId()/partitionSize);
-			
 			//
 			if (posting.getDf() > maxDF) 
 				maxDF = posting.getDf();
@@ -129,7 +131,7 @@
 			lastDocFreq += posting.getDf();
 			counter++;
 		}
-		lexStream.writeNextEntry(lastTermWritten, currentTerm++, lastDocFreq, lastFreq, this.getByteOffset(), (byte)this.getBitOffset());
+		lexStream.writeNextEntry(lastTermWritten, new BasicLexiconEntry(currentTerm++, lastDocFreq, lastFreq, startOffset, startBitOffset));
 		numberOfPointers += lastDocFreq;
 	}
 	
Index: src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java,v
retrieving revision 1.25
diff -w -u -r1.25 BlockStructureMerger.java
--- src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java	28 Jan 2009 20:17:00 -0000	1.25
+++ src/uk/ac/gla/terrier/structures/merging/BlockStructureMerger.java	26 Feb 2009 16:11:47 -0000
@@ -26,8 +26,8 @@
 package uk.ac.gla.terrier.structures.merging;
 import java.io.IOException;
 import java.util.Date;
+
 import uk.ac.gla.terrier.compression.BitOut;
-import uk.ac.gla.terrier.compression.BitOutputStream;
 import uk.ac.gla.terrier.sorting.SortAscendingQuadrupleVectors;
 import uk.ac.gla.terrier.sorting.SortAscendingQuintupleVectors;
 import uk.ac.gla.terrier.structures.BlockDirectInvertedOutputStream;
@@ -53,22 +53,7 @@
  */
 public class BlockStructureMerger extends StructureMerger {
 	
-	/**
-	 * A constructor that sets the filenames of the inverted
-	 * files to merge
-	 * @param _filename1 the first inverted file to merge
-	 * @param _filename2 the second inverted file to merge
-	 * @deprecated
-	 */
-	public BlockStructureMerger(String _filename1, String _filename2) {
-		super(_filename1, _filename2);
-		directFileOutputStreamClass = BlockDirectInvertedOutputStream.class;
-		directFileInputClass = "uk.ac.gla.terrier.structures.BlockDirectIndex";
-		directFileInputStreamClass = "uk.ac.gla.terrier.structures.BlockDirectIndexInputStream";
-		invertedFileOutputStreamClass = BlockDirectInvertedOutputStream.class;
-		invertedFileInputClass = "uk.ac.gla.terrier.structures.BlockInvertedIndex";
-		invertedFileInputStreamClass = "uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream";
-	}
+	
 	
 	public BlockStructureMerger(Index _srcIndex1, Index _srcIndex2, Index _destIndex)
 	{
@@ -81,15 +66,7 @@
 		invertedFileInputStreamClass = "uk.ac.gla.terrier.structures.BlockInvertedIndexInputStream";
 	}
 
-	/** write Block postings.
-	  * @deprecated Use BlockDirectInvertedOutputStream instead */
-	public static void writeBlockPostings(int[][] postings, int firstId, BitOutputStream output, int binaryBits)
-            throws IOException {
-        if (binaryBits>0)
-            writeFieldPostings(postings, firstId, output, binaryBits);
-        else
-            writeNoFieldPostings(postings, firstId, output);
-    }
+	
 	
 	
 	/**
@@ -268,7 +245,8 @@
 		long start = System.currentTimeMillis();
 		logger.info("started at " + (new Date()));
 		if (ApplicationSetup.getProperty("merger.onlylexicons","false").equals("true")) {
-			sMerger.mergeLexicons();
+			System.err.println("Use LexiconMerger");
+			return;
 		} else if (ApplicationSetup.getProperty("merger.onlydocids","false").equals("true")) {
 			sMerger.mergeDocumentIndexFiles();
 		} else {
@@ -280,104 +258,5 @@
 		logger.info("time elapsed: " + ((end-start)*1.0d/1000.0d) + " sec.");
 	}
 	
-	
-	
-	
-	
-	
-	
-	
-	
-	
-	
-	
-	/** write Block postings with fields.
-	* @deprecated Use BlockDirectInvertedOutputStream instead */	
-	public static void  writeFieldPostings(int[][] postings, int firstId, final BitOutputStream output, final int binaryBits)
-	throws IOException {
-
-		//local variables in order to reduce the number
-		//of times we need to access a two-dimensional array
-		final int[] postings0 = postings[0];
-		final int[] postings1 = postings[1];
-		final int[] postings2 = postings[2];
-		final int[] postings3 = postings[3];
-		final int[] postings4 = postings[4];
-		
-		//write the first posting from the term's postings list
-		output.writeGamma(firstId);						//write document id 
-		output.writeUnary(postings1[0]);    			//write frequency
-		output.writeBinary(binaryBits, postings2[0]);	//write fields if binaryBits>0
-		int blockIndex = 0;								//the index of the current block id
-		int blockFrequency = postings3[0];				//the number of block ids to write
-		output.writeUnary(blockFrequency);    			//write block frequency
-		output.writeGamma(postings4[blockIndex]+1);	//write the first block id
-		blockIndex++;									//move to the next block id
-		for (int i=1; i<blockFrequency; i++) {			//write the next blockFrequency-1 ids
-			//write the gap between consequtive block ids
-			output.writeGamma(postings4[blockIndex]-postings4[blockIndex-1]);
-			blockIndex++;
-		}
-		
-		//write the rest of the postings from the term's postings list
-		final int length = postings[0].length;
-		for (int k = 1; k < length; k++) {
-			output.writeGamma(postings0[k] - postings0[k - 1]);	//write gap of document ids
-			output.writeUnary(postings1[k]);					//write term frequency
-			output.writeBinary(binaryBits, postings2[k]);		//write fields if binaryBits>0
-			blockFrequency = postings3[k];						//number of block ids to write
-			output.writeUnary(blockFrequency);					//write block frequency
-			output.writeGamma(postings4[blockIndex]+1);			//write the first block id
-			blockIndex++;										//move to the next block id
-			for (int i=1; i<blockFrequency; i++) {
-				//write the gap between consequtive block ids
-				output.writeGamma(postings4[blockIndex]-postings4[blockIndex-1]);
-				blockIndex++;
-			}
-		}
-	}
-
-	/** write Block postings with fields.
-	* @deprecated Use BlockDirectInvertedOutputStream instead */	
-	public static void writeNoFieldPostings(int[][] postings, int firstId, final BitOutputStream output) 
-		throws IOException {
-		
-		//local variables in order to reduce the number
-		//of times we need to access a two-dimensional array
-		final int[] postings0 = postings[0];
-		final int[] postings1 = postings[1];
-		final int[] postings3 = postings[3];
-		final int[] postings4 = postings[4];
-		
-		//write the first posting from the term's postings list
-		output.writeGamma(firstId);						//write document id 
-		output.writeUnary(postings1[0]);    			//write frequency
-		int blockIndex = 0;								//the index of the current block id
-		int blockFrequency = postings3[0];				//the number of block ids to write
-		output.writeUnary(blockFrequency);    			//write block frequency
-		output.writeGamma(postings4[blockIndex]+1);		//write the first block id
-		blockIndex++;									//move to the next block id
-		for (int i=1; i<blockFrequency; i++) {			//write the next blockFrequency-1 ids
-			//write the gap between consequtive block ids
-			output.writeGamma(postings4[blockIndex]-postings4[blockIndex-1]);
-			blockIndex++;
-		}
-		
-		//write the rest of the postings from the term's postings list
-		final int length = postings0.length;
-		for (int k = 1; k < length; k++) {
-			output.writeGamma(postings0[k] - postings0[k - 1]);	//write gap of document ids
-			output.writeUnary(postings1[k]);					//write term frequency
-			blockFrequency = postings3[k];							//number of block ids to write
-			output.writeUnary(blockFrequency);				//write block frequency
-			output.writeGamma(postings4[blockIndex]+1);		//write the first block id
-			blockIndex++;											//move to the next block id
-			for (int i=1; i<blockFrequency; i++) {
-				//write the gap between consequtive block ids
-				output.writeGamma(postings4[blockIndex]-postings4[blockIndex-1]);
-				blockIndex++;
-			}
-		}		
-	}
 }
 
Index: src/uk/ac/gla/terrier/structures/merging/LexiconMerger.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/merging/LexiconMerger.java,v
retrieving revision 1.8
diff -w -u -r1.8 LexiconMerger.java
--- src/uk/ac/gla/terrier/structures/merging/LexiconMerger.java	28 Jan 2009 20:17:00 -0000	1.8
+++ src/uk/ac/gla/terrier/structures/merging/LexiconMerger.java	26 Feb 2009 16:11:47 -0000
@@ -29,14 +29,19 @@
 
 import java.io.IOException;
 import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
 
+import org.apache.hadoop.io.Text;
 import org.apache.log4j.Logger;
 
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.Index;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
 import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 
 /**
@@ -74,125 +79,119 @@
 	 * lexicon are not correct. They will be updated only after creating the 
 	 * inverted file.
 	 */
+	@SuppressWarnings("unchecked")
 	public void mergeLexicons() {
 		try {
 			
 			//setting the input streams
-			final LexiconInputStream lexInStream1 = (LexiconInputStream)srcIndex1.getIndexStructureInputStream("lexicon");
-			final LexiconInputStream lexInStream2 = (LexiconInputStream)srcIndex2.getIndexStructureInputStream("lexicon");
+			Iterator<Map.Entry<String,LexiconEntry>> lexInStream1 = 
+				(Iterator<Map.Entry<String,LexiconEntry>>)srcIndex1.getIndexStructureInputStream("lexicon");
+			Iterator<Map.Entry<String,LexiconEntry>> lexInStream2 = 
+				(Iterator<Map.Entry<String,LexiconEntry>>)srcIndex2.getIndexStructureInputStream("lexicon");
+			
+			//copy the lexicon key/value factory configuration of the first source index,
+			//using the same index property names as StructureMerger.mergeInvertedFiles()
+			for(String property : new String[] {"index.lexicon-keyfactory.class", "index.lexicon-keyfactory.parameter_values",
+					"index.lexicon-keyfactory.parameter_types", "index.lexicon-valuefactory.class", "index.lexicon-valuefactory.parameter_values",
+					"index.lexicon-valuefactory.parameter_types"} )
+			{
+				destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
+			}
 			
 		
 			//setting the output stream
-			LexiconOutputStream lexOutStream = UTFIndexing
-				? new UTFLexiconOutputStream(destIndex.getPath(), destIndex.getPrefix())
-				: new LexiconOutputStream(destIndex.getPath(), destIndex.getPrefix());
+			LexiconOutputStream<String> lexOutStream = new MapFileLexiconOutputStream(
+					destIndex.getPath(), destIndex.getPrefix(), 
+					"lexicon", 
+					(FixedSizeWriteableFactory<Text>)destIndex.getIndexStructure("lexicon-keyfactory"));
 			
-			int hasMore1 = -1;
-			int hasMore2 = -1;
+			boolean hasMore1 = false;
+			boolean hasMore2 = false;
 			String term1;
 			String term2;
 
 			int termId = 0;
 		
-			hasMore1 = lexInStream1.readNextEntry();
-			hasMore2 = lexInStream2.readNextEntry();
-			while (hasMore1 >=0 && hasMore2 >= 0) {
-				term1 = lexInStream1.getTerm();
-				term2 = lexInStream2.getTerm();
-				//System.out.println("term1 : " + term1 + "with id " + lexInStream1.getTermId());
-				//System.out.println("term2 : " + term2 + "with id " + lexInStream2.getTermId());
+			Map.Entry<String,LexiconEntry> lee1 = null;
+			Map.Entry<String,LexiconEntry> lee2 = null;
+			//read ahead one entry from each lexicon; an entry is only replaced once it has been written
+			hasMore1 = lexInStream1.hasNext();
+			if (hasMore1)
+				lee1 = lexInStream1.next();
+			hasMore2 = lexInStream2.hasNext();
+			if (hasMore2)
+				lee2 = lexInStream2.next();
+			while (hasMore1 && hasMore2) {
+				term1 = lee1.getKey();
+				term2 = lee2.getKey();
 				int lexicographicalCompare = term1.compareTo(term2);
 				if (lexicographicalCompare < 0) {
-					
-					lexOutStream.writeNextEntry(term1,
-									   termId,
-									   lexInStream1.getNt(),
-									   lexInStream1.getTF(),
-									   0L,
-									   (byte)0);
+					lee1.getValue().setTermId(termId);
+					lee1.getValue().setPosition(0, (byte)0);
+					lexOutStream.writeNextEntry(term1, lee1.getValue());
 					termId++;
-					hasMore1 = lexInStream1.readNextEntry();
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
 				
 				} else if (lexicographicalCompare > 0) {
-					
-					lexOutStream.writeNextEntry(term2,
-									   			termId,
-									   			lexInStream2.getNt(),
-									   			lexInStream2.getTF(),
-									   			0L,
-									   			(byte)0);
+					lee2.getValue().setTermId(termId);
+					lee2.getValue().setPosition(0, (byte)0);
+					lexOutStream.writeNextEntry(term2, lee2.getValue());
 					termId++;
-					hasMore2 = lexInStream2.readNextEntry();
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 				} else {
-					lexOutStream.writeNextEntry(term1,
-												termId,
-												(lexInStream1.getNt() + lexInStream2.getNt()),
-												(lexInStream1.getTF() + lexInStream2.getTF()),
-												0L,
-												(byte)0);
-					hasMore1 = lexInStream1.readNextEntry();
-					hasMore2 = lexInStream2.readNextEntry();
+					lee1.getValue().setTermId(termId);
+					lee1.getValue().setPosition(0, (byte)0);
+					lee1.getValue().add(lee2.getValue());
+					lexOutStream.writeNextEntry(term1, lee1.getValue());
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 					termId++;
 				}
 			}
 			
-			if (hasMore1 >= 0) {
-				while (hasMore1 >= 0) {
-					lexOutStream.writeNextEntry(lexInStream1.getTerm(),
-									   			termId,
-									   			lexInStream1.getNt(),
-									   			lexInStream1.getTF(),
-									   			0L,
-												(byte)0);
-					hasMore1 = lexInStream1.readNextEntry();
+			//at most one lexicon has entries left; its current entry has already been read ahead
+			if (hasMore1) {
+				while (hasMore1) {
+					lee1.getValue().setTermId(termId);
+					lee1.getValue().setPosition(0, (byte)0);
+					lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
 					termId++;
 				}
-			} else if (hasMore2 >= 0) {
-				while (hasMore2 >= 0) {
-					lexOutStream.writeNextEntry(lexInStream2.getTerm(),
-												termId,
-												lexInStream2.getNt(),
-												lexInStream2.getTF(),
-												0L,
-												(byte)0);
-					hasMore2 = lexInStream2.readNextEntry();
+			} else if (hasMore2) {
+				while (hasMore2) {
+					lee2.getValue().setTermId(termId);
+					lee2.getValue().setPosition(0, (byte)0);
+					lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 					termId++;
 				}		
 			}
 			
-			lexInStream1.close();
-			lexInStream2.close();
-			destIndex.setIndexProperty("num.Pointers", ""+lexOutStream.getNumberOfPointersWritten());
-			destIndex.setIndexProperty("num.Terms", ""+lexOutStream.getNumberOfTermsWritten());
-			destIndex.setIndexProperty("num.Tokens", ""+lexOutStream.getNumberOfTokensWritten());
-			destIndex.addIndexStructure("lexicon", UTFIndexing
-					? "uk.ac.gla.terrier.structures.UTFLexicon"
-					: "uk.ac.gla.terrier.structures.Lexicon");
-			destIndex.addIndexStructureInputStream("lexicon", UTFIndexing
-					? "uk.ac.gla.terrier.structures.UTFLexiconInputStream"
-					: "uk.ac.gla.terrier.structures.LexiconInputStream");
+			if (lexInStream1 instanceof Closeable) {
+				((Closeable)lexInStream1).close();
+			}
+			if (lexInStream2 instanceof Closeable) {
+				((Closeable)lexInStream2).close();
+			}
 			lexOutStream.close();
+			LexiconBuilder.optimise(destIndex, "lexicon");
 			destIndex.flush();
 		} catch(IOException ioe) {
 			logger.error("IOException while merging lexicons.", ioe);
 		}
-		// create an empty lexid file
-		//try{
-		//	BufferedWriter bw = new BufferedWriter(Files.writeFileWriter(
-		//			this.lexiconFileOutput+"id"));
-		//	bw.write(" ");
-		//	bw.close();
-		//}
-		//catch(IOException e){
-		//	e.printStackTrace();
-		//}
-		try{
-			LexiconBuilder.createLexiconIndex(destIndex);
-			if (USE_HASH)
-				LexiconBuilder.createLexiconHash(destIndex);
-		} catch (IOException ioe) {
-			logger.warn("Problems writing lexicon lexid or lexicon hash", ioe);
-		}
 	}
 	public static void main(String[] args) {
 
Index: src/uk/ac/gla/terrier/structures/merging/StructureMerger.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/structures/merging/StructureMerger.java,v
retrieving revision 1.28
diff -w -u -r1.28 StructureMerger.java
--- src/uk/ac/gla/terrier/structures/merging/StructureMerger.java	28 Jan 2009 20:17:00 -0000	1.28
+++ src/uk/ac/gla/terrier/structures/merging/StructureMerger.java	26 Feb 2009 16:11:47 -0000
@@ -30,25 +30,28 @@
 
 import java.io.IOException;
 import java.util.Date;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.log4j.Logger;
 
-import uk.ac.gla.terrier.compression.BitOut;
 import uk.ac.gla.terrier.sorting.SortAscendingPairedVectors;
 import uk.ac.gla.terrier.sorting.SortAscendingTripleVectors;
+import uk.ac.gla.terrier.structures.Closeable;
 import uk.ac.gla.terrier.structures.DirectIndex;
 import uk.ac.gla.terrier.structures.DirectInvertedOutputStream;
 import uk.ac.gla.terrier.structures.DocumentIndexInputStream;
 import uk.ac.gla.terrier.structures.FilePosition;
 import uk.ac.gla.terrier.structures.Index;
 import uk.ac.gla.terrier.structures.InvertedIndex;
-import uk.ac.gla.terrier.structures.LexiconInputStream;
+import uk.ac.gla.terrier.structures.LexiconEntry;
 import uk.ac.gla.terrier.structures.LexiconOutputStream;
-import uk.ac.gla.terrier.structures.UTFLexiconOutputStream;
+import uk.ac.gla.terrier.structures.MapFileLexiconOutputStream;
 import uk.ac.gla.terrier.structures.indexing.DocumentIndexBuilder;
 import uk.ac.gla.terrier.structures.indexing.LexiconBuilder;
+import uk.ac.gla.terrier.structures.seralization.FixedSizeWriteableFactory;
 import uk.ac.gla.terrier.utility.ApplicationSetup;
 
-import org.apache.log4j.Logger;
-
 /**
  * This class merges the structures created by Terrier, so that
  * we use fewer and larger inverted and direct files.
@@ -63,8 +66,6 @@
 public class StructureMerger {
 	/** use UTF supporting lexicon */
 	protected final boolean UTFIndexing = Boolean.parseBoolean(ApplicationSetup.getProperty("string.use_utf", "false"));
-	/** build a lexicon hash */
-	protected boolean USE_HASH = Boolean.parseBoolean(ApplicationSetup.getProperty("lexicon.use.hash","true"));
 	
 	/** the logger used */
 	protected static Logger logger = Logger.getRootLogger();
@@ -128,34 +129,6 @@
 		numberOfTerms = 0;
 	}
 	
-	protected static String[] getIndexPathPrefix(String _IFfilename)
-	{
-		
-		String parts[] = _IFfilename.split(ApplicationSetup.FILE_SEPARATOR);
-		String path = _IFfilename.replaceFirst(parts[parts.length -1]+"$", ""); 
-		String prefix = parts[parts.length -1].replaceAll(ApplicationSetup.IFSUFFIX+"$", "");
-		return new String[]{path,prefix};
-	}
-	
-	/**
-	 * A constructor that sets the filenames of the inverted
-	 * files to merge
-	 * @param _srcfilename1 the first inverted file to merge
-	 * @param _srcfilename2 the second inverted file to merge
-	 * @deprecated
-	 */
-	public StructureMerger(String _srcfilename1, String _srcfilename2) {
-		String[] p1 = getIndexPathPrefix(_srcfilename1);
-		String[] p2 = getIndexPathPrefix(_srcfilename2);
-		srcIndex1 = Index.createIndex(p1[0], p1[1]);
-		srcIndex2 = Index.createIndex(p2[0], p2[1]);
-		
-		//invertedFile1 = _filename1;
-		//invertedFile2 = _filename2;
-		numberOfDocuments = 0;
-		numberOfPointers = 0;
-		numberOfTerms = 0;
-	}
 	
 	/**
 	 * Sets the number of bits to write or read for binary encoded numbers
@@ -165,16 +138,7 @@
 		binaryBits = bits;
 	}
 	
-	/**
-	 * Sets the output filename of the merged inverted file
-	 * @param _outputName the filename of the merged inverted file
-	 * @deprecated
-	 */
-	public void setOutputFilename(String _outputName) {
-		//invertedFileOutput = _outputName;
-		String[] p = getIndexPathPrefix(_outputName);
-		destIndex = Index.createNewIndex(p[0], p[1]);
-	}
+
 	
 	/**
 	 * Sets the output index. This index should have no documents
@@ -191,6 +155,7 @@
 	 * lexicon are ot correct. They will be updated only after creating the 
 	 * inverted file.
 	 */
+	@SuppressWarnings("unchecked")
 	protected void mergeInvertedFiles() {
 		try {
 			//getting the number of entries in the first document index, 
@@ -208,13 +173,24 @@
 				termcodeHashmap = new TIntIntHashMap();
 
 			//setting the input streams
-			LexiconInputStream lexInStream1 = (LexiconInputStream)srcIndex1.getIndexStructureInputStream("lexicon");
-			LexiconInputStream lexInStream2 = (LexiconInputStream)srcIndex2.getIndexStructureInputStream("lexicon");
+			Iterator<Map.Entry<String,LexiconEntry>> lexInStream1 = 
+				(Iterator<Map.Entry<String,LexiconEntry>>)srcIndex1.getIndexStructureInputStream("lexicon");
+			Iterator<Map.Entry<String,LexiconEntry>> lexInStream2 = 
+				(Iterator<Map.Entry<String,LexiconEntry>>)srcIndex2.getIndexStructureInputStream("lexicon");
+			
+			for(String property : new String[] {"index.lexicon-keyfactory.class", "index.lexicon-keyfactory.parameter_values",
+					"index.lexicon-keyfactory.parameter_types", "index.lexicon-valuefactory.class", "index.lexicon-valuefactory.parameter_values",
+					"index.lexicon-valuefactory.parameter_types"} )
+			{
+				destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
+			}
 			
-			LexiconOutputStream lexOutStream = UTFIndexing
-				? new UTFLexiconOutputStream(destIndex.getPath(), destIndex.getPrefix())
-				: new LexiconOutputStream(destIndex.getPath(), destIndex.getPrefix());
+			FixedSizeWriteableFactory<LexiconEntry> lvf = 
+				(FixedSizeWriteableFactory<LexiconEntry>)srcIndex1.getIndexStructure("lexicon-valuefactory");
 				
+			//setting the output stream
+			LexiconOutputStream<String> lexOutStream = 
+				new MapFileLexiconOutputStream(destIndex, "lexicon", (Class <FixedSizeWriteableFactory<LexiconEntry>>) lvf.getClass());
 
 			int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms(); 
 			
@@ -227,7 +203,7 @@
 					(DirectInvertedOutputStream)invertedFileOutputStreamClass
 					.getConstructor(String.class,Integer.TYPE)
 					.newInstance(destIndex.getPath() + ApplicationSetup.FILE_SEPARATOR +  
-								destIndex.getPrefix() + ApplicationSetup.IFSUFFIX,
+								destIndex.getPrefix() + ".inverted.bf",
 								binaryBits);
 			} catch (Exception e) {
 				logger.error("Couldn't create specified DirectInvertedOutputStream", e);
@@ -237,162 +213,166 @@
 			//BitOut invertedOutput = new BitOutputStream(
 			//	);
 
-			int hasMore1 = -1;
-			int hasMore2 = -1;
+			boolean hasMore1 = false;
+			boolean hasMore2 = false;
 			String term1;
 			String term2;
+			Map.Entry<String,LexiconEntry> lee1 = null;
+			Map.Entry<String,LexiconEntry> lee2 = null;
+			hasMore1 = lexInStream1.hasNext();
+			if (hasMore1)
+				lee1 = lexInStream1.next();
+			hasMore2 = lexInStream2.hasNext();
+			if (hasMore2)
+				lee2 = lexInStream2.next();
+			while (hasMore1 && hasMore2) {
 		
-			hasMore1 = lexInStream1.readNextEntry();
-			hasMore2 = lexInStream2.readNextEntry();
-			while (hasMore1 >=0 && hasMore2 >= 0) {
-				term1 = lexInStream1.getTerm();
-				term2 = lexInStream2.getTerm();
+				term1 = lee1.getKey();
+				term2 = lee2.getKey();
 
 				int lexicographicalCompare = term1.compareTo(term2);
-				//System.err.println("Comparing "+lexInStream1.getTermId() +":"+ term1 + " with "+lexInStream2.getTermId()+ ":"+ term2 + " results="+lexicographicalCompare);
 				if (lexicographicalCompare < 0) {
 					
 					//write to inverted file as well.
-					int[][] docs = inverted1.getDocuments(lexInStream1.getTermId());
+					
+					int[][] docs = inverted1.getDocuments(lee1.getValue());
+					long startOffset = invOS.getByteOffset();
+					byte startBitOffset = invOS.getBitOffset();
+					
 					invOS.writePostings(docs, docs[0][0]+1);
-					//writePostings(docs, docs[0][0]+1, invertedOutput, binaryBits);
 					numberOfPointers+=docs[0].length;
-					long endOffset = invOS.getByteOffset();
-					byte endBitOffset = invOS.getBitOffset();
-					endBitOffset--;
-					if (endBitOffset < 0 && endOffset > 0) {
-						endBitOffset = 7;
-						endOffset--;
-					}
-					
-					lexOutStream.writeNextEntry(term1,
-									   lexInStream1.getTermId(),
-									   lexInStream1.getNt(),
-									   lexInStream1.getTF(),
-									   endOffset,
-									   endBitOffset);
-					hasMore1 = lexInStream1.readNextEntry();
+//					long endOffset = invOS.getByteOffset();
+//					byte endBitOffset = invOS.getBitOffset();
+//					endBitOffset--;
+//					if (endBitOffset < 0 && endOffset > 0) {
+//						endBitOffset = 7;
+//						endOffset--;
+//					}
+					lee1.getValue().setPosition(startOffset, startBitOffset);
+					lexOutStream.writeNextEntry(term1, lee1.getValue());
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
 				
 				} else if (lexicographicalCompare > 0) {
 					//write to inverted file as well.
-					int[][] docs = inverted2.getDocuments(lexInStream2.getTermId());
+					int[][] docs = inverted2.getDocuments(lee2.getValue());
+					long startOffset = invOS.getByteOffset();
+					byte startBitOffset = invOS.getBitOffset();
 					invOS.writePostings(docs, docs[0][0]+numberOfDocs1+1);
-					//writePostings(docs, docs[0][0]+numberOfDocs1+1, invertedOutput, binaryBits);
-					numberOfPointers+=docs[0].length;
-					long endOffset = invOS.getByteOffset();
-					byte endBitOffset = invOS.getBitOffset();
 					
-					endBitOffset--;
-					if (endBitOffset < 0 && endOffset > 0) {
-						endBitOffset = 7;
-						endOffset--;
-					}
+					numberOfPointers+=docs[0].length;
+//					long endOffset = invOS.getByteOffset();
+//					byte endBitOffset = invOS.getBitOffset();
+//					
+//					endBitOffset--;
+//					if (endBitOffset < 0 && endOffset > 0) {
+//						endBitOffset = 7;
+//						endOffset--;
+//					}
 					
 					int newCode = newCodes++;
 					if (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
-					
-					lexOutStream.writeNextEntry(term2,
-									   			newCode,
-									   			lexInStream2.getNt(),
-									   			lexInStream2.getTF(),
-									   			endOffset,
-									   			endBitOffset);
-					hasMore2 = lexInStream2.readNextEntry();
+						termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
+					lee2.getValue().setTermId(newCode);
+					lee2.getValue().setPosition(startOffset, startBitOffset);
+					lexOutStream.writeNextEntry(term2, lee2.getValue());
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 				} else {
 					//write to inverted file as well.
-					int[][] docs1 = inverted1.getDocuments(lexInStream1.getTermId());
-					int[][] docs2 = inverted2.getDocuments(lexInStream2.getTermId());
+					int[][] docs1 = inverted1.getDocuments(lee1.getValue());
+					int[][] docs2 = inverted2.getDocuments(lee2.getValue());
+					long startOffset = invOS.getByteOffset();
+					byte startBitOffset = invOS.getBitOffset();
 					invOS.writePostings(docs1, docs1[0][0]+1);
-					//writePostings(docs1, docs1[0][0]+1, invertedOutput, binaryBits);
 					numberOfPointers+=docs1[0].length;
 					invOS.writePostings(docs2, docs2[0][0] + numberOfDocs1 - docs1[0][docs1[0].length-1]);
-					//writePostings(docs2, docs2[0][0] + numberOfDocs1 - docs1[0][docs1[0].length-1], 
-					//					invertedOutput, binaryBits);
 					numberOfPointers+=docs2[0].length;
-					long endOffset = invOS.getByteOffset();
-					byte endBitOffset = invOS.getBitOffset();
-					endBitOffset--;
-					if (endBitOffset < 0 && endOffset > 0) {
-						endBitOffset = 7;
-						endOffset--;
-					}
+//					long endOffset = invOS.getByteOffset();
+//					byte endBitOffset = invOS.getBitOffset();
+//					endBitOffset--;
+//					if (endBitOffset < 0 && endOffset > 0) {
+//						endBitOffset = 7;
+//						endOffset--;
+//					}
+					
 					
-					int newCode = lexInStream1.getTermId();
+					lee1.getValue().setPosition(startOffset, startBitOffset);
+					int newCode = lee1.getValue().getTermId();
 					if (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
+						termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
 					
-					lexOutStream.writeNextEntry(term1,
-												newCode,
-												(lexInStream1.getNt() + lexInStream2.getNt()),
-												(lexInStream1.getTF() + lexInStream2.getTF()),
-												endOffset,
-												endBitOffset);
-					hasMore1 = lexInStream1.readNextEntry();
-					hasMore2 = lexInStream2.readNextEntry();
+					lee1.getValue().add(lee2.getValue());
+					lexOutStream.writeNextEntry(term1, lee1.getValue());
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
+					
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 				}
 			}
 			
-			if (hasMore1 >= 0) {
-				while (hasMore1 >= 0) {
-					
+			if (hasMore1) {
+				while (hasMore1) {
+					//lee1 already holds the next unwritten entry of lexicon 1; do not call next() here or it is skipped
 					//write to inverted file as well.
-					int[][] docs = inverted1.getDocuments(lexInStream1.getTermId());
+					int[][] docs = inverted1.getDocuments(lee1.getValue());
+					long startOffset = invOS.getByteOffset();
+					byte startBitOffset = invOS.getBitOffset();
 					invOS.writePostings(docs, docs[0][0]+1);
-					//writePostings(docs, docs[0][0]+1, invertedOutput, binaryBits);
 					numberOfPointers+=docs[0].length;
-					long endOffset = invOS.getByteOffset();
-					byte endBitOffset = invOS.getBitOffset();
-					//long endOffset = invertedOutput.getByteOffset();
-					//byte endBitOffset = invertedOutput.getBitOffset();
-					endBitOffset--;
-					if (endBitOffset < 0 && endOffset > 0) {
-						endBitOffset = 7;
-						endOffset--;
-					}
-					
-					lexOutStream.writeNextEntry(lexInStream1.getTerm(),
-									   			lexInStream1.getTermId(),
-									   			lexInStream1.getNt(),
-									   			lexInStream1.getTF(),
-									   			endOffset,
-												endBitOffset);
-					hasMore1 = lexInStream1.readNextEntry();
+//					long endOffset = invOS.getByteOffset();
+//					byte endBitOffset = invOS.getBitOffset();
+//					endBitOffset--;
+//					if (endBitOffset < 0 && endOffset > 0) {
+//						endBitOffset = 7;
+//						endOffset--;
+//					}
+					lee1.getValue().setPosition(startOffset, startBitOffset);
+					lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
+					hasMore1 = lexInStream1.hasNext();
+					if (hasMore1)
+						lee1 = lexInStream1.next();
 				}
-			} else if (hasMore2 >= 0) {
-				while (hasMore2 >= 0) {
+			} else if (hasMore2) {
+				while (hasMore2) {
 					//write to inverted file as well.
-					int[][] docs = inverted2.getDocuments(lexInStream2.getTermId());
+					int[][] docs = inverted2.getDocuments(lee2.getValue());
+					long startOffset = invOS.getByteOffset();
+					byte startBitOffset = invOS.getBitOffset();
 					invOS.writePostings(docs, docs[0][0]+numberOfDocs1+1);
-					//writePostings(docs, docs[0][0]+numberOfDocs1+1, invertedOutput, binaryBits);
-					numberOfPointers+=docs[0].length;
-					long endOffset = invOS.getByteOffset();
-					byte endBitOffset = invOS.getBitOffset();
 					
-					//long endOffset = invertedOutput.getByteOffset();
-					//byte endBitOffset = invertedOutput.getBitOffset();
-					endBitOffset--;
-					if (endBitOffset < 0 && endOffset > 0) {
-						endBitOffset = 7;
-						endOffset--;
-					}
+					numberOfPointers+=docs[0].length;
+//					long endOffset = invOS.getByteOffset();
+//					byte endBitOffset = invOS.getBitOffset();
+//					endBitOffset--;
+//					if (endBitOffset < 0 && endOffset > 0) {
+//						endBitOffset = 7;
+//						endOffset--;
+//					}
 					
 					int newCode = newCodes++;
 					if (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
-						
-					lexOutStream.writeNextEntry(lexInStream2.getTerm(),
-												newCode,
-												lexInStream2.getNt(),
-												lexInStream2.getTF(),
-												endOffset,
-												endBitOffset);
-					hasMore2 = lexInStream2.readNextEntry();		
+						termcodeHashmap.put(lee2.getValue().getTermId(), newCode);
+					lee2.getValue().setTermId(newCode);
+					lee2.getValue().setPosition(startOffset, startBitOffset);
+					lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
+					hasMore2 = lexInStream2.hasNext();
+					if (hasMore2)
+						lee2 = lexInStream2.next();
 				}		
 			}
 			
-			lexInStream1.close();
-			lexInStream2.close();
+			if (lexInStream1 instanceof Closeable) {
+				((Closeable)lexInStream1).close();
+			}
+			if (lexInStream2 instanceof Closeable) {
+				((Closeable)lexInStream2).close();
+			}
 			
 
 			inverted1.close();
@@ -400,25 +380,16 @@
 			invOS.close();
 			
 			destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
-			destIndex.setIndexProperty("num.Pointers", ""+lexOutStream.getNumberOfPointersWritten());
-			destIndex.setIndexProperty("num.Terms", ""+lexOutStream.getNumberOfTermsWritten());
-			destIndex.setIndexProperty("num.Tokens", ""+lexOutStream.getNumberOfTokensWritten());
-			destIndex.addIndexStructure("lexicon", UTFIndexing 
-					? "uk.ac.gla.terrier.structures.UTFLexicon" 
-					: "uk.ac.gla.terrier.structures.Lexicon");
-			destIndex.addIndexStructureInputStream("lexicon", UTFIndexing 
-					? "uk.ac.gla.terrier.structures.UTFLexiconInputStream" 
-					: "uk.ac.gla.terrier.structures.LexiconInputStream");
 			destIndex.addIndexStructure(
 					"inverted", 
 					invertedFileInputClass, 
-					"uk.ac.gla.terrier.structures.Lexicon,java.lang.String,java.lang.String", 
-					"lexicon,path,prefix");
+					"uk.ac.gla.terrier.structures.Index,java.lang.String", 
+					"index,structureName");
 			destIndex.addIndexStructureInputStream(
                     "inverted",
                     invertedFileInputStreamClass,
-                    "java.lang.String,java.lang.String,uk.ac.gla.terrier.structures.LexiconInputStream",
-                    "path,prefix,lexicon-inputstream");
+                    "uk.ac.gla.terrier.structures.Index,java.lang.String,java.util.Iterator",
+                    "index,structureName,lexicon-inputstream");
 			lexOutStream.close();
 			destIndex.flush();
 								
@@ -427,129 +398,6 @@
 		}
 	}
 
-		/**
-	 * Merges the two lexicons into one. After this stage, the offsets in the
-	 * lexicon are not correct. 
-	 */
-	protected void mergeLexicons() {
-		try {
-			//getting the number of entries in the first document index, 
-			//in order to assign the correct docids to the documents 
-			//of the second inverted file.			
-			
-			//creating a new map between new and old term codes
-			if (keepTermCodeMap)
-				termcodeHashmap = new TIntIntHashMap();
-			
-			//setting the input streams
-			final LexiconInputStream lexInStream1 = (LexiconInputStream)srcIndex1.getIndexStructureInputStream("lexicon");
-			final LexiconInputStream lexInStream2 = (LexiconInputStream)srcIndex2.getIndexStructureInputStream("lexicon");
-			
-			final LexiconOutputStream lexOutStream = UTFIndexing
-				? new UTFLexiconOutputStream(destIndex.getPath(), destIndex.getPrefix())
-				: new LexiconOutputStream(destIndex.getPath(), destIndex.getPrefix());
-				
-
-			int newCodes = (int)srcIndex1.getCollectionStatistics().getNumberOfUniqueTerms(); 
-
-			int hasMore1 = -1;
-			int hasMore2 = -1;
-			String term1;
-			String term2;
-		
-			hasMore1 = lexInStream1.readNextEntry();
-			hasMore2 = lexInStream2.readNextEntry();
-			while (hasMore1 >=0 && hasMore2 >= 0) {
-				term1 = lexInStream1.getTerm();
-				term2 = lexInStream2.getTerm();
-
-				int lexicographicalCompare = term1.compareTo(term2);
-				if (lexicographicalCompare < 0) {
-					
-					lexOutStream.writeNextEntry(term1,
-									   lexInStream1.getTermId(),
-									   lexInStream1.getNt(),
-									   lexInStream1.getTF(),
-									   0L,
-									   (byte)0);
-					hasMore1 = lexInStream1.readNextEntry();
-				
-				} else if (lexicographicalCompare > 0) {
-					int newCode = newCodes++;
-					if (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
-					
-					lexOutStream.writeNextEntry(term2,
-									   			newCode,
-									   			lexInStream2.getNt(),
-									   			lexInStream2.getTF(),
-									   			0L,
-									   			(byte)0);
-					hasMore2 = lexInStream2.readNextEntry();
-				} else {
-					int newCode = lexInStream1.getTermId();
-					if (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
-					
-					lexOutStream.writeNextEntry(term1,
-												newCode,
-												(lexInStream1.getNt() + lexInStream2.getNt()),
-												(lexInStream1.getTF() + lexInStream2.getTF()),
-												0L,
-												(byte)0);
-					hasMore1 = lexInStream1.readNextEntry();
-					hasMore2 = lexInStream2.readNextEntry();
-				}
-			}
-			
-			if (hasMore1 >= 0) {
-				while (hasMore1 >= 0) {
-									
-					lexOutStream.writeNextEntry(lexInStream1.getTerm(),
-									   			lexInStream1.getTermId(),
-									   			lexInStream1.getNt(),
-									   			lexInStream1.getTF(),
-									   			0L,
-												(byte)0);
-					hasMore1 = lexInStream1.readNextEntry();
-				}
-			} else if (hasMore2 >= 0) {
-				while (hasMore2 >= 0) {
-					int newCode = newCodes++;
-					if  (keepTermCodeMap)
-						termcodeHashmap.put(lexInStream2.getTermId(), newCode);
-						
-					lexOutStream.writeNextEntry(lexInStream2.getTerm(),
-												newCode,
-												lexInStream2.getNt(),
-												lexInStream2.getTF(),
-												0L,
-												(byte)0);
-					hasMore2 = lexInStream2.readNextEntry();		
-				}		
-			}
-			
-			lexInStream1.close();
-			lexInStream2.close();
-			
-			
-			destIndex.setIndexProperty("num.Documents", ""+numberOfDocuments);
-			destIndex.setIndexProperty("num.Pointers", ""+lexOutStream.getNumberOfPointersWritten());
-			destIndex.setIndexProperty("num.Terms", ""+lexOutStream.getNumberOfTermsWritten());
-			destIndex.setIndexProperty("num.Tokens", ""+lexOutStream.getNumberOfTokensWritten());
-			destIndex.addIndexStructure("lexicon", UTFIndexing 
-					? "uk.ac.gla.terrier.structures.UTFLexicon" 
-					: "uk.ac.gla.terrier.structures.Lexicon");
-			destIndex.addIndexStructureInputStream("lexicon", UTFIndexing 
-					? "uk.ac.gla.terrier.structures.UTFLexiconInputStream" 
-					: "uk.ac.gla.terrier.structures.LexiconInputStream");
-			lexOutStream.close();
-			destIndex.flush();
-
-		} catch(IOException ioe) {
-			logger.error("IOException while merging lexicons.", ioe);
-		}
-	}
 
 	/**
 	 * Merges the two direct files and the corresponding document id files.
@@ -710,17 +558,7 @@
 	 * creates the final term code to offset file, and the lexicon hash if enabled.
 	 */
 	protected void createLexidFile() {
-		try {
-			LexiconBuilder.createLexiconIndex(destIndex);
-		} catch(IOException ioe) {
-			logger.error("IOException while creating lexid file.", ioe);
-		}
-		if (USE_HASH)
-			try{
-				LexiconBuilder.createLexiconHash(destIndex);
-			} catch (IOException ioe) {
-				logger.error("IOException while creating lexicon hash file", ioe);
-			}
+		LexiconBuilder.optimise(destIndex, "lexicon");
 	}
 	
 	/**
@@ -743,7 +581,7 @@
 		}
 		else if (bothLexicon)
 		{
-			mergeLexicons();
+			new LexiconMerger(srcIndex1, srcIndex2, destIndex).mergeLexicons();
 			t2 = System.currentTimeMillis();
     	    logger.info("merged lexicons in " + ((t2-t1)/1000.0d));
 		}
@@ -804,7 +642,8 @@
 		long start = System.currentTimeMillis();
 		logger.info("started at " + (new Date()));
 		if (ApplicationSetup.getProperty("merger.onlylexicons","false").equals("true")) {
-			sMerger.mergeLexicons();
+			logger.error("merger.onlylexicons is no longer supported here; use LexiconMerger instead");
+			return;
 		} else if (ApplicationSetup.getProperty("merger.onlydocids","false").equals("true")) {
 			sMerger.mergeDocumentIndexFiles();
 		} else {
@@ -819,84 +658,7 @@
 		logger.info("time elapsed: " + ((end-start)*1.0d/1000.0d) + " sec.");
 	}
 
-	/**
-	 * Writes the given postings to a bit file. Depending on 
-	 * the value of the field binaryBits, this method will call the 
-	 * appropriate method writeToInvertedFileFields, or
-	 * writeToInvertedFileNoFields.
-	 * @param postings the postings list to write.
-	 * @param firstId the first identifier to write. This can be 
-	 *        an id plus one, or the gap of the current id and the previous one.
-	 * @param output the output bit file.
-	 * @deprecated Please use DirectInvertedOutputStream instead
-	 */
-	public static void writePostings(int[][] postings, int firstId, BitOut output, int binaryBits)
-			throws IOException {
-		if (binaryBits>0) 
-			writeFieldPostings(postings, firstId, output, binaryBits);
-		else 
-			writeNoFieldPostings(postings, firstId, output);
-	}
-	
-	/**
-	 * Writes the given postings to a bit file. This method assumes that
-	 * field information is available as well.
-	 * @param postings the postings list to write.
-	 * @param firstId the first identifier to write. This can be 
-	 *        an id plus one, or the gap of the current id and the previous one.
-	 * @param output the output bit file.
-	 * @deprecated use DirectInvertedIndexOutputStream
-	 */
-	public static void  writeFieldPostings(int[][] postings, int firstId, BitOut output, int binaryBits) 
-			throws IOException {
-		
-		//local variables in order to reduce the number
-		//of times we need to access a two-dimensional array
-		final int[] postings0 = postings[0];
-		final int[] postings1 = postings[1];
-		final int[] postings2 = postings[2];
-		
-		//write the first entry
-		output.writeGamma(firstId);
-		output.writeUnary(postings1[0]);
-		output.writeBinary(binaryBits, postings2[0]);
-	
-		final int length = postings0.length;
-		for (int k = 1; k < length; k++) {
-			output.writeGamma(postings0[k] - postings0[k - 1]);
-			output.writeUnary(postings1[k]);
-			output.writeBinary(binaryBits, postings2[k]);
-		}
-	}
-	
-	/**
-	 * Writes the given postings to a bit file. This method assumes that
-	 * field information is not available.
-	 * @param postings the postings list to write.
-	 * @param firstId the first identifier to write. This can be 
-	 *        an id plus one, or the gap of the current id and the previous one.
-	 * @param output the output bit file.
-	 * @throws IOException if an error occurs during writing to a file.
-	 * @deprecated use DirectInvertedIndexOutputStream
-	 */
-	public static void writeNoFieldPostings(int[][] postings, int firstId, BitOut output) 
-			throws IOException {
 
-		//local variables in order to reduce the number
-		//of times we need to access a two-dimensional array
-		final int[] postings0 = postings[0];
-		final int[] postings1 = postings[1];
-		
-		//write the first entry
-		output.writeGamma(firstId);
-		output.writeUnary(postings1[0]);
-	
-		final int length = postings[0].length;
-		for (int k = 1; k < length; k++) {
-			output.writeGamma(postings0[k] - postings0[k - 1]);
-			output.writeUnary(postings1[k]);
-		}
-	}
 
 
 }
Index: src/uk/ac/gla/terrier/utility/ApplicationSetup.java
===================================================================
RCS file: /usr/local/cvs/javair/terrier/src/uk/ac/gla/terrier/utility/ApplicationSetup.java,v
retrieving revision 1.71
diff -w -u -r1.71 ApplicationSetup.java
--- src/uk/ac/gla/terrier/utility/ApplicationSetup.java	28 Jan 2009 20:17:02 -0000	1.71
+++ src/uk/ac/gla/terrier/utility/ApplicationSetup.java	26 Feb 2009 16:11:47 -0000
@@ -230,7 +230,7 @@
 	 * property is <tt>if.suffix</tt> and by default
 	 * the value of this property is <tt>.if</tt>
 	 */
-	public static String IFSUFFIX;
+	//public static String IFSUFFIX;
 	
 	/**
 	 * The suffix of the file that contains the
@@ -238,7 +238,7 @@
 	 * <tt>lexicon.suffix</tt> and by default 
 	 * the value of this property is <tt>.lex</tt>
 	 */
-	public static String LEXICONSUFFIX;
+	//public static String LEXICONSUFFIX;
 	
 	/**
 	 * The suffix of the file that contains the
@@ -255,11 +255,11 @@
 	 * property is <tt>lexicon.index.suffix</tt> and
 	 * by default its value is .lexid.
 	 */
-	public static String LEXICON_INDEX_SUFFIX;
+	//public static String LEXICON_INDEX_SUFFIX;
 
 	/** The suffix of the lexicon hash file. Corresponding property
      * is <tt>lexicon.hash.suffix</tt>, default is ".lexhash". */
-	public static String LEXICON_HASH_SUFFIX;
+	//public static String LEXICON_HASH_SUFFIX;
 
 	
 	/**
@@ -346,16 +346,16 @@
 	public static String TERRIER_INDEX_PREFIX;
 	
 	/** The filename of the inverted file.*/
-	public static String INVERTED_FILENAME;
+	//public static String INVERTED_FILENAME;
 	/** The filename of the direct file.*/
 	public static String DIRECT_FILENAME;
 	/** The filename of the document index.*/
 	public static String DOCUMENT_INDEX_FILENAME;
 	/** The filename of the lexicon file.*/
-	public static String LEXICON_FILENAME;
+	//public static String LEXICON_FILENAME;
 	
 	/** The filename of the lexicon index file.*/
-	public static String LEXICON_INDEX_FILENAME;
+	//public static String LEXICON_INDEX_FILENAME;
 	/** The filename of the log (statistics) file.*/
 	public static String LOG_FILENAME;
 	
@@ -532,10 +532,10 @@
 		//The following properties specify the filenames and suffixes
 		COLLECTION_SPEC = makeAbsolute(getProperty("collection.spec", "collection.spec"), TERRIER_ETC);
 	
-		IFSUFFIX = getProperty("if.suffix", ".if");
-		LEXICONSUFFIX = getProperty("lexicon.suffix", ".lex");
-		LEXICON_INDEX_SUFFIX = getProperty("lexicon.index.suffix", ".lexid");
-		LEXICON_HASH_SUFFIX = getProperty("lexicon.hash.suffix",".lexhash");
+		//IFSUFFIX = getProperty("if.suffix", ".if");
+		//LEXICONSUFFIX = getProperty("lexicon.suffix", ".lex");
+		//LEXICON_INDEX_SUFFIX = getProperty("lexicon.index.suffix", ".lexid");
+		//LEXICON_HASH_SUFFIX = getProperty("lexicon.hash.suffix",".lexhash");
 		DOC_INDEX_SUFFIX = getProperty("doc.index.suffix", ".docid");
 		LOG_SUFFIX = getProperty("log.suffix", ".log");
 		DF_SUFFIX = getProperty("df.suffix", ".df");
@@ -693,11 +693,11 @@
 	 */
 	public static void setupFilenames() {
 		String filenameTemplate = TERRIER_INDEX_PATH + FILE_SEPARATOR + TERRIER_INDEX_PREFIX;
-		INVERTED_FILENAME =filenameTemplate + IFSUFFIX;
+		//INVERTED_FILENAME =filenameTemplate + IFSUFFIX;
 		DIRECT_FILENAME = filenameTemplate + DF_SUFFIX;
 		DOCUMENT_INDEX_FILENAME = filenameTemplate + DOC_INDEX_SUFFIX;
-		LEXICON_FILENAME = filenameTemplate + LEXICONSUFFIX;
-		LEXICON_INDEX_FILENAME = filenameTemplate + LEXICON_INDEX_SUFFIX;
+		//LEXICON_FILENAME = filenameTemplate + LEXICONSUFFIX;
+		//LEXICON_INDEX_FILENAME = filenameTemplate + LEXICON_INDEX_SUFFIX;
 		LOG_FILENAME = filenameTemplate + LOG_SUFFIX;
 	}
 

