Index: src/uk/ac/gla/terrier/utility/FixedSizeReader.java
===================================================================
--- src/uk/ac/gla/terrier/utility/FixedSizeReader.java	(revision 0)
+++ src/uk/ac/gla/terrier/utility/FixedSizeReader.java	(revision 0)
@@ -0,0 +1,116 @@
+/*
+ * Terrier - Terabyte Retriever 
+ * Webpage: http://ir.dcs.gla.ac.uk/terrier 
+ * Contact: terrier{a.}dcs.gla.ac.uk
+ * University of Glasgow - Department of Computing Science
+ * http://www.gla.ac.uk/
+ * 
+ * The contents of this file are subject to the Mozilla Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is FixedSizeReader.java
+ *
+ * The Original Code is Copyright (C) 2004-2009 the University of Glasgow.
+ * All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Craig Macdonald (Original author)
+ */
+package uk.ac.gla.terrier.utility;
+
+import java.io.IOException;
+import java.io.FilterReader;
+import java.io.Reader;
+
+/** Reader filter class that stops reading after a given number of characters.
+  * Beware when using this class: while it will never read more characters than
+  * permitted from the underlying Reader, that Reader may itself buffer internally, so
+  * characters it has read ahead but not handed out are not returned to the underlying InputStream.
+  * @author Craig Macdonald */
+public class FixedSizeReader extends FilterReader
+{
+	/** maximum characters to read */
+	protected final long maxsize;
+	/** number of characters read so far */
+	protected long size;
+	/** create a new FixedSizeReader, using in as the underlying
+	  * Reader, and maxsize as the maximum number of characters to read.
+	  * @param in underlying Reader to read characters from.
+	  * @param maxsize maximum number of characters to read.
+	  */
+	public FixedSizeReader(Reader in, long maxsize)
+	{
+		super(in);
+		this.maxsize = maxsize;
+	}
+
+	/** Read a single character from the underlying Reader.
+	  * @return The character read, as an integer in the range 0 to 65535 (0x00-0xffff), or -1 if the end of the underlying stream has been reached
+	  * or the maximum allowed number of characters has been read from it.
+	  * @throws IOException If an I/O error occurs.
+	  */
+	public int read() throws IOException
+	{
+		if (size >= maxsize)
+			return -1;
+		final int ch = in.read();
+		if (ch != -1)
+			size++;
+		return ch;
+	}
+	
+	/** Read characters into a portion of an array.
+	  * @param cbuf Destination buffer
+	  * @param off Offset at which to start storing characters
+	  * @param len  Maximum number of characters to read
+	  * @return The number of characters read, or -1 if the end of the stream or the maximum allowed number of characters has been reached.
+	  * @throws IOException If an I/O error occurs in the underlying reader.
+	  */
+	public int read(char[] cbuf, int off, int len) throws IOException
+	{
+		if (size >= maxsize)
+			return -1;
+		// never ask the underlying reader for more than the remaining
+		// allowance; the result fits in an int because it is at most len
+		final int toRead = (int) Math.min((long) len, maxsize - size);
+		final int rtr = in.read(cbuf, off, toRead);
+		// rtr is -1 at the end of the underlying stream: adding it to size
+		// would wrongly give back one character of the allowance
+		if (rtr > 0)
+			size += rtr;
+		return rtr;
+	}
+
+    /**
+     * Skips n characters from the stream. If the end of the stream,
+     * or the maximum allowed number of characters, is reached before
+     * n characters have been skipped, then it returns early.
+     * <B>NB:</B> This method uses read() internally.
+     * @param n long the number of characters to skip.
+     * @return long the number of characters actually skipped.
+     * @throws IOException if there is any error while
+     *       reading from the stream.
+     */
+    public long skip(long n) throws IOException {
+        /* TODO a more efficient implementation could be made */
+        long i = 0;
+        // read() returns -1 at the end of the stream or of the allowance
+        while (i < n && this.read() != -1)
+            i++;
+        return i;
+    }
+
+
+	/* marks are not supported: mark() and reset() do nothing */
+	public boolean markSupported() { return false; }
+	public void mark(int readAheadLimit) throws IOException { return; }
+	public void reset() throws IOException { return; }
+	
+}
Index: src/uk/ac/gla/terrier/utility/FixedSizeInputStream.java
===================================================================
--- src/uk/ac/gla/terrier/utility/FixedSizeInputStream.java	(revision 0)
+++ src/uk/ac/gla/terrier/utility/FixedSizeInputStream.java	(revision 0)
@@ -0,0 +1,129 @@
+/*
+ * Terrier - Terabyte Retriever 
+ * Webpage: http://ir.dcs.gla.ac.uk/terrier 
+ * Contact: terrier{a.}dcs.gla.ac.uk
+ * University of Glasgow - Department of Computing Science
+ * http://www.gla.ac.uk/
+ * 
+ * The contents of this file are subject to the Mozilla Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is FixedSizeInputStream.java
+ *
+ * The Original Code is Copyright (C) 2004-2009 the University of Glasgow.
+ * All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Craig Macdonald (original author)
+ */
+package uk.ac.gla.terrier.utility;
+
+import java.io.IOException;
+import java.io.FilterInputStream;
+import java.io.InputStream;
+
+/** An input stream wrapper that only allows a given pre-determined number of bytes
+  * to be read from a specified input stream.
+  * @author Craig Macdonald
+  */
+public class FixedSizeInputStream extends FilterInputStream
+{
+	/** maximum bytes to read */
+	protected final long maxsize;
+	/** number of bytes read so far */
+	protected long size;
+
+	/** prevent a close() from doing anything, like closing the underlying stream */
+	protected boolean suppressClose = false;
+
+	/** create a new FixedSizeInputStream, using in as the underlying
+	  * InputStream, and maxsize as the maximum number of bytes to read.
+	  * @param in underlying InputStream to read bytes from.
+	  * @param maxsize maximum number of bytes to read.
+	  */
+	public FixedSizeInputStream(InputStream in, long maxsize)
+	{
+		super(in);
+		this.maxsize = maxsize;
+	}
+
+	/** Read a single byte from the underlying InputStream.
+	  * @return The byte read, or -1 if the end of the underlying stream has been reached
+	  * or the maximum allowed number of bytes has been read from it.
+	  * @throws IOException If an I/O error occurs.
+	  */
+	public int read() throws IOException
+	{
+		if (size >= maxsize)
+			return -1;
+		final int by = in.read();
+		// only a byte that was really delivered counts towards the limit
+		if (by != -1)
+			size++;
+		return by;
+	}
+	
+	/** Read bytes into a portion of an array. No more bytes are requested
+	  * from the underlying stream than the limit still allows, and the end
+	  * of the underlying stream leaves the count of bytes read unchanged.
+	  * @param cbuf Destination buffer
+	  * @param off Offset at which to start storing bytes
+	  * @param len  Maximum number of bytes to read
+	  * @return The number of bytes read, or -1 if the end of the stream or the maximum allowed number of bytes has been reached.
+	  * @throws IOException If an I/O error occurs in the underlying stream.
+	  */
+	public int read(byte[] cbuf, int off, int len) throws IOException
+	{
+		if (size >= maxsize)
+			return -1;
+		// never ask the underlying stream for more than the remaining
+		// allowance; the result fits in an int because it is at most len
+		final int toRead = (int) Math.min((long) len, maxsize - size);
+		final int rtr = in.read(cbuf, off, toRead);
+		// rtr is -1 at the end of the underlying stream: adding it to size
+		// would wrongly give back one byte of the allowance
+		if (rtr > 0)
+			size += rtr;
+		return rtr;
+	}
+
+    /**
+     * Skips n bytes from the stream. If the end of the stream,
+     * or the maximum allowed number of bytes, is reached before
+     * n bytes have been skipped, then it returns early.
+     * <B>NB:</B> This method uses read() internally.
+     * @param n long the number of bytes to skip.
+     * @return long the number of bytes actually skipped.
+     * @throws IOException if there is any error while
+     *       reading from the stream.
+     */
+    public long skip(long n) throws IOException {
+        /* TODO a more efficient implementation could be made */
+        long i = 0;
+        // read() returns -1 at the end of the stream or of the allowance
+        while (i < n && this.read() != -1)
+            i++;
+        return i;
+    }
+
+	/* marks are not supported: mark() and reset() do nothing */
+	public boolean markSupported() { return false; }
+	public void mark(int readAheadLimit) { return; }
+	public void reset() throws IOException { return; }
+	/** Call this if you do not want close() to close the underlying stream */
+	public void suppressClose() { suppressClose = true; }
+	/** Closes the underlying stream, unless closing has been suppressed earlier by calling suppressClose(). */
+	public void close() throws IOException
+	{ 
+		if (suppressClose)
+			return;
+		super.close();
+	}
+}
Index: src/uk/ac/gla/terrier/indexing/ClueWARC018Collection.java
===================================================================
--- src/uk/ac/gla/terrier/indexing/ClueWARC018Collection.java	(revision 0)
+++ src/uk/ac/gla/terrier/indexing/ClueWARC018Collection.java	(revision 0)
@@ -0,0 +1,364 @@
+/*
+ * Terrier - Terabyte Retriever 
+ * Webpage: http://ir.dcs.gla.ac.uk/terrier 
+ * Contact: terrier{a.}dcs.gla.ac.uk
+ * University of Glasgow - Department of Computing Science
+ * http://www.gla.ac.uk/
+ * 
+ * The contents of this file are subject to the Mozilla Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
+ * the License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is ClueWARC018Collection.java
+ *
+ * The Original Code is Copyright (C) 2004-2009 the University of Glasgow.
+ * All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Craig Macdonald <craigm{a.}dcs.gla.ac.uk> (original author) 
+ */
+package uk.ac.gla.terrier.indexing;
+
+import uk.ac.gla.terrier.utility.Files;
+import uk.ac.gla.terrier.utility.ApplicationSetup;
+import uk.ac.gla.terrier.utility.StringTools;
+import java.io.Reader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.File;
+import java.io.BufferedReader;
+import org.apache.log4j.Logger;
+import java.util.ArrayList;
+import java.util.Map;
+import java.util.HashMap;
+import uk.ac.gla.terrier.utility.FixedSizeInputStream;
+import java.io.BufferedInputStream;
+import java.io.FileInputStream;
+
+import java.util.regex.Pattern;
+import java.util.regex.Matcher;
+
+/** This object is used to parse WARC format web crawls, v0.18, as used by the
+  * ClueWeb09 web collection.
+  * @author Craig Macdonald
+  */
+public class ClueWARC018Collection implements Collection
+{
+	/** logger for this class */
+	protected static Logger logger = Logger.getLogger(ClueWARC018Collection.class);
+	/** Counts the number of documents that have been found in this file. */
+	protected int documentsInThisFile = 0;
+	/** are we at the end of the collection? */
+	protected boolean eoc = false;
+	/** has the end of the current input file been reached? */
+	protected boolean eof = false;
+	/** the input stream of the current input file */
+	protected InputStream is = null;
+	/** the length of the blob containing the document data */
+	protected long currentDocumentBlobLength = 0;
+	/** properties for the current document, keyed by lower-cased header name */
+	protected Map<String,String> DocProperties = null;
+	/** The list of files to process. */
+	protected ArrayList<String> FilesToProcess = new ArrayList<String>();
+	/** The index in the FilesToProcess of the next file to open. */
+	protected int FileNumber = 0;
+
+	/** Default constructor for this collection object. Reads the list of files to process
+	  * from the collection specification file named by ApplicationSetup.COLLECTION_SPEC. */
+	public ClueWARC018Collection()
+	{
+		this(ApplicationSetup.COLLECTION_SPEC);
+	}
+
+	/** construct a collection from the denoted collection.spec file, then open the first usable file that it lists */
+	public ClueWARC018Collection(final String CollectionSpecFilename)
+	{
+		readCollectionSpec(CollectionSpecFilename);
+		try{
+			openNextFile();
+		} catch (IOException ioe) {
+			logger.error("Problem opening first file ", ioe); //logged only: construction still succeeds
+		}
+	}
+
+	/**
+     * A constructor that reads only the specified
+     * InputStream; no collection specification file is loaded. */
+    public ClueWARC018Collection(InputStream input)
+    {
+        is = input;
+    }
+
+	
+	/** Closes the collection, and any input file that is currently open. */
+	public void close()
+	{
+		try{
+			if (is != null) is.close(); //is stays null when no input file could be opened
+		} catch (IOException ioe) { 
+			logger.warn("Problem closing collection",ioe);
+		}
+	}
+
+	/** Returns true if the end of the collection has been reached, i.e. once openNextFile() found no further file to open. */
+	public boolean endOfCollection()
+	{
+		return eoc;
+	}
+
+	/** Get the String document identifier of the current document: its "docno" property, which nextDocument() copies from "warc-trec-id". */
+	public String getDocid()
+	{
+		return DocProperties.get("docno");
+	}
+
+	/** Get the document object representing the current document.
+	  * The document reads at most currentDocumentBlobLength bytes of the input stream,
+	  * decoded with the charset property of the document when there is a usable one. */
+	public Document getDocument()
+	{
+		/* TODO: allow other document objects */
+		//the document may not read past its own blob, nor close the shared input stream
+		final FixedSizeInputStream blob = new FixedSizeInputStream(is, currentDocumentBlobLength);
+		blob.suppressClose();
+		Reader docReader = null;
+		String encoding = DocProperties.get("charset");
+		if (encoding != null)
+		{
+			try{
+				encoding = StringTools.normaliseEncoding(encoding);
+				logger.debug("Using "+ encoding + " to decode "+ DocProperties.get("docno"));
+				docReader = new InputStreamReader(blob, encoding);
+			} catch (java.io.UnsupportedEncodingException uee) {
+				logger.warn("Encoding "+encoding+ " is unrecognised, resorting to system default");
+			} catch (Exception e) {
+				logger.warn("Problem reading documents, perhaps encoding "+encoding+ " is unrecognised, trying to read with system default encoding", e);
+			}
+		}
+		//no charset was declared, or it could not be used: decode with the system default
+		if (docReader == null)
+			docReader = new InputStreamReader(blob);
+		return new TRECDocument(docReader, DocProperties);
+	}
+
+	/** Reads "key: value" header lines into DocProperties (keys lower-cased) and returns the characters consumed, counting one per line ending. */
+	protected int parseHeaders(final boolean requireContentLength) throws IOException
+	{
+		int headerSize = 0;
+		boolean foundContentLength = false;
+		while(true)
+		{
+			final String followLine = readLine();
+			final int len = followLine.length();
+			headerSize += len +1;
+			//a blank line ends the headers, but only counts once any required content-length was seen
+			if (len == 0 && ((! requireContentLength) || foundContentLength))
+				break;
+			//at the end of the file readLine() only returns empty lines: fail rather than spin forever
+			if (len == 0 && eof)
+				throw new IOException("End of file reached before a content-length header was found");
+			final int colonIndex = followLine.indexOf(':');
+			if (colonIndex < 0)
+				continue;
+			final String key = followLine.substring(0,colonIndex).trim().toLowerCase();
+			final String value = followLine.substring(colonIndex+1, len).trim();
+			DocProperties.put(key, value);
+			if (key.equals("content-length"))
+				foundContentLength = true;
+		}
+		return headerSize;
+	}
+
+	/** Move the collection to the start of the next document, i.e. the next WARC record of type response.
+	  * @return true if such a record was found; false at the end of the collection, or after an I/O error. */
+	public boolean nextDocument()
+	{
+		DocProperties = new HashMap<String,String>(15);
+		try{
+		warcrecord: while(true)
+		{
+			final String line = readLine();
+			if (line.startsWith("WARC/0.18"))
+			{
+				parseHeaders(true); //the WARC headers of the record, including its Content-Length
+				final long warc_response_length = Long.parseLong(DocProperties.get("content-length"));
+				//constant first, so that a record without a WARC-Type header is skipped too
+				if (! "response".equals(DocProperties.get("warc-type")))
+				{
+					//InputStream.skip() may skip fewer bytes than asked: repeat until done or EOF
+					long toSkip = warc_response_length;
+					while (toSkip > 0)
+					{
+						final long skipped = is.skip(toSkip);
+						if (skipped <= 0 && is.read() == -1) break; //end of file
+						toSkip -= Math.max(skipped, 1); //a successful read() consumed one byte
+					}
+					continue warcrecord;
+				}
+				//these further headers count towards the Content-Length of the record
+				final int headerSize = parseHeaders(false);
+				DocProperties.put("docno", DocProperties.get("warc-trec-id"));
+				DocProperties.put("url", DocProperties.get("warc-target-uri"));
+				if (logger.isDebugEnabled())
+					logger.debug("Now working on document "+ DocProperties.get("warc-trec-id"));
+				//put the character set in the charset property (see http://boston.lti.cs.cmu.edu/Data/clueweb09/dataset.html#encodings)
+				String cType = DocProperties.get("content-type");
+				if (cType != null)
+				{
+					cType = cType.toLowerCase();
+					final Matcher m = charsetMatchPattern.matcher(cType);
+					if (cType.contains("charset") && m.find() && m.groupCount() > 0)
+						DocProperties.put("charset", m.group(1));
+				}
+				documentsInThisFile++;
+				currentDocumentBlobLength = warc_response_length - headerSize;
+				return true;
+			}
+			if (eof && ! openNextFile())
+				return false;
+		}
+		} catch (IOException ioe) {
+			logger.error("IOException while reading WARC format collection file", ioe);
+		}
+		return false;
+	}
+
+	static final Pattern charsetMatchPattern = Pattern.compile("charset[:=]\\s*['\"]?([0-9a-zA-Z_\\-]+)['\"]?"); //group 1 captures the charset name; matched against the lower-cased content-type value
+
+	/** Reads one line, a byte at a time, from the currently open InputStream is.
+	  * A line ends at "\n", at "\r\n", or at the end of the stream (eof is then set);
+	  * the terminator is not returned, and each byte is widened to a char without decoding.
+	  * @return the line read, which is empty once the end of the stream has been reached.
+	  * @throws IOException if the InputStream cannot be read. */
+	protected String readLine() throws IOException
+	{
+		final StringBuilder lineBuffer = new StringBuilder();
+		for (int current = is.read(); ; current = is.read())
+		{
+			if (current == -1)
+			{
+				eof = true;
+				break;
+			}
+			final char first = (char)current;
+			if (first == '\n')
+				break;
+			if (first != '\r')
+			{
+				//ordinary character of the line
+				lineBuffer.append(first);
+				continue;
+			}
+			//a carriage return only ends the line when a line feed follows it
+			final int following = is.read();
+			if (following == -1)
+			{
+				//the stream ends on the carriage return, which is kept in the line
+				lineBuffer.append(first);
+				eof = true;
+				break;
+			}
+			if ((char)following == '\n')
+				break;
+			lineBuffer.append(first).append((char)following);
+		}
+		return lineBuffer.toString();
+	}
+
+	/**
+	 * Opens the next file from the collection specification, after closing
+	 * the file that is currently open. Listed files that do not exist, or
+	 * that cannot be read, are logged and passed over. When a file is
+	 * opened, the count of documents found in the current file and the
+	 * end of file flag are reset; when no file is left, the end of
+	 * collection flag is set.
+	 * @return boolean true if a file was opened successfully. If there
+	 *	   are no more files to open, it returns false.
+	 * @throws IOException if there is an exception while opening the
+	 *	   collection files.
+	 */
+	protected boolean openNextFile() throws IOException {
+		//try to close the currently open file
+		if (is!=null)
+			try{
+				is.close();
+			}catch (IOException ioe) {/* Ignore, it's not an error */ }
+
+		//keep trying files until one can be opened,
+		//or until the list of files is exhausted
+		while (FileNumber < FilesToProcess.size())
+		{
+			//the index moves on even if this file turns out to be unusable
+			final String filename = FilesToProcess.get(FileNumber);
+			FileNumber++;
+
+			//check the filename is sane
+			final File candidate = new File(filename);
+			if (! candidate.exists())
+			{
+				logger.warn("Could not open "+filename+" : File Not Found");
+				continue;
+			}
+			if (! candidate.canRead())
+			{
+				logger.warn("Could not open "+filename+" : Cannot read");
+				continue;
+			}
+
+			//filename seems ok, open it
+			/* WARC format files have multiple compressed records. JDK one can't deal with this
+			 * See: http://crawler.archive.org/apidocs/index.html?org/archive/io/arc/ARCWriter.html
+			 * Reading through an external zcat process (/usr/bin/gzip -dc) was the
+			 * workaround sketched here, but it is disabled: every file is opened
+			 * with Files.openFileStream().
+			 */
+			is = Files.openFileStream(filename); //throws an IOException, throw upwards
+			logger.info("WARC018Collection processing "+filename);
+
+			//accurately record file offset
+			documentsInThisFile = 0;
+			eof = false;
+
+			//return success, there is no need to try another file
+			return true;
+		}
+
+		//last file of the collection has been read, EOC:
+		//from now on endOfCollection() returns true
+		eoc = true;
+		return false;
+	}
+
+	/** Read in the collection.spec: each line that is neither blank nor a # comment names a file to process. */
+	protected void readCollectionSpec(String CollectionSpecFilename)
+	{
+		try {
+			final BufferedReader br2 = Files.openFileReader(CollectionSpecFilename);
+			FilesToProcess = new ArrayList<String>();
+			try {
+				for (String line = br2.readLine(); line != null; line = br2.readLine()) {
+					final String filename = line.trim();
+					if (!filename.startsWith("#") && !filename.equals(""))
+						FilesToProcess.add(filename);
+				}
+			} finally {
+				br2.close(); //also releases the file when readLine() fails
+			}
+			logger.info("WARC018Collection read collection specification");
+		} catch (IOException ioe) {
+			logger.error("Input output exception while loading the collection.spec file. ("+CollectionSpecFilename+")", ioe);
+		}
+	}
+
+	/** Does nothing: resetting the collection to its start is not implemented here. */
+	public void reset()
+	{}
+
+}
Index: share/MANIFEST.txt
===================================================================
--- share/MANIFEST.txt	(revision 2573)
+++ share/MANIFEST.txt	(working copy)
@@ -118,6 +118,7 @@
 doc/javadoc/uk/ac/gla/terrier/indexing/SimpleXMLCollection.html
 doc/javadoc/uk/ac/gla/terrier/indexing/Tokenizer.html
 doc/javadoc/uk/ac/gla/terrier/indexing/TRECCollection.html
+doc/javadoc/uk/ac/gla/terrier/indexing/ClueWARC018Collection.html
 doc/javadoc/uk/ac/gla/terrier/indexing/TRECDocument.html
 doc/javadoc/uk/ac/gla/terrier/indexing/TRECFullTokenizer.html
 doc/javadoc/uk/ac/gla/terrier/indexing/TRECFullUTFTokenizer.html
@@ -129,6 +130,7 @@
 doc/javadoc/uk/ac/gla/terrier/indexing/class-use/BasicSinglePassIndexer.html
 doc/javadoc/uk/ac/gla/terrier/indexing/class-use/BlockIndexer.html
 doc/javadoc/uk/ac/gla/terrier/indexing/class-use/BlockSinglePassIndexer.html
+doc/javadoc/uk/ac/gla/terrier/indexing/class-use/ClueWARC018Collection.html
 doc/javadoc/uk/ac/gla/terrier/indexing/class-use/CollectionFactory.html
 doc/javadoc/uk/ac/gla/terrier/indexing/class-use/Collection.html
 doc/javadoc/uk/ac/gla/terrier/indexing/class-use/CreateDocumentInitialWeightIndex.html
@@ -700,6 +702,8 @@
 doc/javadoc/uk/ac/gla/terrier/utility/FieldScore.html
 doc/javadoc/uk/ac/gla/terrier/utility/Files.html
 doc/javadoc/uk/ac/gla/terrier/utility/Files.FSCapability.html
+doc/javadoc/uk/ac/gla/terrier/utility/FixedSizeInputStream.html
+doc/javadoc/uk/ac/gla/terrier/utility/FixedSizeReader.html
 doc/javadoc/uk/ac/gla/terrier/utility/HeapSort.html
 doc/javadoc/uk/ac/gla/terrier/utility/LookAheadReader.html
 doc/javadoc/uk/ac/gla/terrier/utility/LookAheadStream.html
@@ -903,6 +907,7 @@
 src/uk/ac/gla/terrier/indexing/BasicSinglePassIndexer.java
 src/uk/ac/gla/terrier/indexing/BlockIndexer.java
 src/uk/ac/gla/terrier/indexing/BlockSinglePassIndexer.java
+src/uk/ac/gla/terrier/indexing/ClueWARC018Collection.java
 src/uk/ac/gla/terrier/indexing/CollectionFactory.java
 src/uk/ac/gla/terrier/indexing/Collection.java
 src/uk/ac/gla/terrier/indexing/CreateDocumentInitialWeightIndex.java
@@ -1163,6 +1168,8 @@
 src/uk/ac/gla/terrier/utility/ArrayUtils.java
 src/uk/ac/gla/terrier/utility/FieldScore.java
 src/uk/ac/gla/terrier/utility/Files.java
+src/uk/ac/gla/terrier/utility/FixedSizeInputStream.java
+src/uk/ac/gla/terrier/utility/FixedSizeReader.java
 src/uk/ac/gla/terrier/utility/HeapSort.java
 src/uk/ac/gla/terrier/utility/io/CountingInputStream.java
 src/uk/ac/gla/terrier/utility/io/FileSystem.java

