?? experimentalwarcwritertest.java
字號:
/*
 * ExperimentalWARCWriterTest
 *
 * $Id: ExperimentalWARCWriterTest.java,v 1.12 2006/08/30 02:35:48 stack-sf Exp $
 *
 * Created on July 27th, 2006
 *
 * Copyright (C) 2006 Internet Archive.
 *
 * This file is part of the Heritrix web crawler (crawler.archive.org).
 *
 * Heritrix is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser Public License as published by
 * the Free Software Foundation; either version 2.1 of the License, or
 * any later version.
 *
 * Heritrix is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser Public License for more details.
 *
 * You should have received a copy of the GNU Lesser Public License
 * along with Heritrix; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
package org.archive.io.warc;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;

import org.archive.io.ArchiveRecord;
import org.archive.io.ArchiveRecordHeader;
import org.archive.io.UTF8Bytes;
import org.archive.io.WriterPoolMember;
import org.archive.uid.GeneratorFactory;
import org.archive.util.ArchiveUtils;
import org.archive.util.TmpDirTestCase;
import org.archive.util.anvl.ANVLRecord;

/**
 * Test Writer and Reader.
 * @author stack
 * @version $Date: 2006/08/30 02:35:48 $ $Version$
 */
public class ExperimentalWARCWriterTest
extends TmpDirTestCase implements WARCConstants {
    /**
     * Serial-number counter passed to the writers constructed by this class.
     */
    private static final AtomicInteger SERIAL_NO = new AtomicInteger();

/**
 * Prefix to use for ARC files made by JUNIT.
*/ private static final String PREFIX = "IAH"; private static final String SOME_URL = "http://www.archive.org/test/"; public void testCheckHeaderLineValue() throws Exception { ExperimentalWARCWriter writer = new ExperimentalWARCWriter(); writer.checkHeaderLineParameters("one"); IOException exception = null; try { writer.checkHeaderLineParameters("with space"); } catch(IOException e) { exception = e; } assertNotNull(exception); exception = null; try { writer.checkHeaderLineParameters("with\0x0000controlcharacter"); } catch(IOException e) { exception = e; } assertNotNull(exception); } public void testMimetypes() throws IOException { ExperimentalWARCWriter writer = new ExperimentalWARCWriter(); writer.checkHeaderLineMimetypeParameter("text/xml"); writer.checkHeaderLineMimetypeParameter("text/xml+rdf"); writer.checkHeaderLineMimetypeParameter( "text/plain; charset=SHIFT-JIS"); System.out.println(writer.checkHeaderLineMimetypeParameter( "multipart/mixed; \r\n boundary=\"simple boundary\"")); } public void testWriteRecord() throws IOException { File [] files = {getTmpDir()}; // Write uncompressed. ExperimentalWARCWriter writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files), this.getClass().getName(), "suffix", false, -1, null); writeFile(writer); // Write compressed. 
writer = new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files), this.getClass().getName(), "suffix", true, -1, null); writeFile(writer); } private void writeFile(final ExperimentalWARCWriter writer) throws IOException { try { writeWarcinfoRecord(writer); writeBasicRecords(writer); } finally { writer.close(); writer.getFile().delete(); } } private void writeWarcinfoRecord(ExperimentalWARCWriter writer) throws IOException { ANVLRecord meta = new ANVLRecord(); meta.addLabelValue("size", "1G"); meta.addLabelValue("operator", "igor"); byte [] bytes = meta.getUTF8Bytes(); writer.writeWarcinfoRecord(ANVLRecord.MIMETYPE, null, new ByteArrayInputStream(bytes), bytes.length); } protected void writeBasicRecords(final ExperimentalWARCWriter writer) throws IOException { ANVLRecord headerFields = new ANVLRecord(); headerFields.addLabelValue("x", "y"); headerFields.addLabelValue("a", "b"); URI rid = null; try { rid = GeneratorFactory.getFactory(). getQualifiedRecordID(TYPE, METADATA); } catch (URISyntaxException e) { // Convert to IOE so can let it out. throw new IOException(e.getMessage()); } final String content = "Any old content."; for (int i = 0; i < 10; i++) { String body = i + ". " + content; byte [] bodyBytes = body.getBytes(UTF8Bytes.UTF8); writer.writeRecord(METADATA, "http://www.archive.org/", ArchiveUtils.get14DigitDate(), "no/type", rid, headerFields, new ByteArrayInputStream(bodyBytes), (long)bodyBytes.length); } } /** * @return Generic HTML Content. */ protected static String getContent() { return getContent(null); } /** * @return Generic HTML Content with mention of passed <code>indexStr</code> * in title and body. */ protected static String getContent(String indexStr) { String page = (indexStr != null)? "Page #" + indexStr: "Some Page"; return "HTTP/1.1 200 OK\r\n" + "Content-Type: text/html\r\n\r\n" + "<html><head><title>" + page + "</title></head>" + "<body>" + page + "</body></html>"; } /** * Write random HTML Record. * @param w Where to write. 
* @param index An index to put into content. * @return Length of record written. * @throws IOException */ protected int writeRandomHTTPRecord(ExperimentalWARCWriter w, int index) throws IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); String indexStr = Integer.toString(index); byte[] record = (getContent(indexStr)).getBytes(); int recordLength = record.length; baos.write(record); // Add named fields for ip, checksum, and relate the metadata // and request to the resource field. ANVLRecord r = new ANVLRecord(1); r.addLabelValue(NAMED_FIELD_IP_LABEL, "127.0.0.1"); w.writeResourceRecord( "http://www.one.net/id=" + indexStr, ArchiveUtils.get14DigitDate(), "text/html; charset=UTF-8", r, new ByteArrayInputStream(baos.toByteArray()), recordLength); return recordLength; } /** * Fill a WARC with HTML Records. * @param baseName WARC basename. * @param compress Whether to compress or not. * @param maxSize Maximum WARC size. * @param recordCount How many records. * @return The written file. * @throws IOException */ private File writeRecords(String baseName, boolean compress, int maxSize, int recordCount) throws IOException { cleanUpOldFiles(baseName); File [] files = {getTmpDir()}; ExperimentalWARCWriter w = new ExperimentalWARCWriter(SERIAL_NO, Arrays.asList(files), baseName + '-' + PREFIX, "", compress, maxSize, null); assertNotNull(w); for (int i = 0; i < recordCount; i++) { writeRandomHTTPRecord(w, i); } w.close(); assertTrue("Doesn't exist: " + w.getFile().getAbsolutePath(), w.getFile().exists()); return w.getFile(); } /** * Run validation of passed file. * @param f File to validate. * @param recordCount Expected count of records.
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -