?? package-summary.html
字號:
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"><!--NewPage--><HTML><HEAD><!-- Generated by javadoc (build 1.5.0_07) on Sun May 06 18:00:00 GMT 2007 --><TITLE>org.archive.util.ms (Heritrix 1.12.1)</TITLE><META NAME="keywords" CONTENT="org.archive.util.ms package"><LINK REL ="stylesheet" TYPE="text/css" HREF="../../../../stylesheet.css" TITLE="Style"><SCRIPT type="text/javascript">function windowTitle(){ parent.document.title="org.archive.util.ms (Heritrix 1.12.1)";}</SCRIPT><NOSCRIPT></NOSCRIPT></HEAD><BODY BGCOLOR="white" onload="windowTitle();"><!-- ========= START OF TOP NAVBAR ======= --><A NAME="navbar_top"><!-- --></A><A HREF="#skip-navbar_top" title="Skip navigation links"></A><TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" CELLSPACING="0" SUMMARY=""><TR><TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1"><A NAME="navbar_top_firstrow"><!-- --></A><TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY=""> <TR ALIGN="center" VALIGN="top"> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD> <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <FONT CLASS="NavBarFont1">Class</FONT> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../help-doc.html"><FONT 
CLASS="NavBarFont1"><B>Help</B></FONT></A> </TD> </TR></TABLE></TD><TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM></EM></TD></TR><TR><TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> <A HREF="../../../../org/archive/util/iterator/package-summary.html"><B>PREV PACKAGE</B></A> NEXT PACKAGE</FONT></TD><TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> <A HREF="../../../../index.html?org/archive/util/ms/package-summary.html" target="_top"><B>FRAMES</B></A> <A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A> <SCRIPT type="text/javascript"> <!-- if(window==top) { document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>'); } //--></SCRIPT><NOSCRIPT> <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A></NOSCRIPT></FONT></TD></TR></TABLE><A NAME="skip-navbar_top"></A><!-- ========= END OF TOP NAVBAR ========= --><HR><H2>Package org.archive.util.ms</H2>Memory-efficient reading of .doc files.<P><B>See:</B><BR> <A HREF="#package_description"><B>Description</B></A><P><TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY=""><TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor"><TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2"><B>Interface Summary</B></FONT></TH></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/BlockFileSystem.html" title="interface in org.archive.util.ms">BlockFileSystem</A></B></TD><TD>Describes the internal file system contained in .doc files.</TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/Entry.html" title="interface in org.archive.util.ms">Entry</A></B></TD><TD> </TD></TR></TABLE> <P><TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY=""><TR BGCOLOR="#CCCCFF" CLASS="TableHeadingColor"><TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2"><B>Class Summary</B></FONT></TH></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A 
HREF="../../../../org/archive/util/ms/BlockInputStream.html" title="class in org.archive.util.ms">BlockInputStream</A></B></TD><TD>InputStream for a file contained in a BlockFileSystem.</TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/Cp1252.html" title="class in org.archive.util.ms">Cp1252</A></B></TD><TD>A fast implementation of code page 1252.</TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/DefaultBlockFileSystem.html" title="class in org.archive.util.ms">DefaultBlockFileSystem</A></B></TD><TD>Default implementation of the Block File System.</TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/DefaultEntry.html" title="class in org.archive.util.ms">DefaultEntry</A></B></TD><TD> </TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/Doc.html" title="class in org.archive.util.ms">Doc</A></B></TD><TD>Reads .doc files.</TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/HeaderBlock.html" title="class in org.archive.util.ms">HeaderBlock</A></B></TD><TD> </TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/Piece.html" title="class in org.archive.util.ms">Piece</A></B></TD><TD> </TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/PieceReader.html" title="class in org.archive.util.ms">PieceReader</A></B></TD><TD> </TD></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/PieceTable.html" title="class in org.archive.util.ms">PieceTable</A></B></TD><TD>The piece table of a .doc file.</TD></TR></TABLE> <P><TABLE BORDER="1" WIDTH="100%" CELLPADDING="3" CELLSPACING="0" SUMMARY=""><TR BGCOLOR="#CCCCFF" 
CLASS="TableHeadingColor"><TH ALIGN="left" COLSPAN="2"><FONT SIZE="+2"><B>Enum Summary</B></FONT></TH></TR><TR BGCOLOR="white" CLASS="TableRowColor"><TD WIDTH="15%"><B><A HREF="../../../../org/archive/util/ms/Entry.EntryType.html" title="enum in org.archive.util.ms">Entry.EntryType</A></B></TD><TD> </TD></TR></TABLE> <P><A NAME="package_description"><!-- --></A><H2>Package org.archive.util.ms Description</H2><P>Memory-efficient reading of .doc files. To extract the text from a .doc file, use <A HREF="../../../../org/archive/util/ms/Doc.html#getText(org.archive.io.SeekInputStream)"><CODE>Doc.getText(SeekInputStream)</CODE></A>. That's basically the whole API. The other classes are necessary to make that method work, and you can probably ignore them. <h2>Implementation/Format Details</h2><p>These APIs differ from the POI API provided by Apache in that POI wants to load complete documents into memory. Though POI does provide an "event-driven" API that is memory efficient, that API cannot be used to scan text across block or piece boundaries.<p>This package provides a stream-based API for extracting the text of a .doc file. At this time, the package does not provide a way to extract style attributes, embedded images, subdocuments, change tracking information, and so on.<p>There are two layers of abstraction between the contents of a .doc file and reality. The first layer is the <i>Block File System</i>, and the second layer is the <i>piece table</i>.<h3>The Block File System</h3><p>All .doc files are secretly file systems, like a .iso file, but insane. A good overview of how this file system is arranged inside the file is available in <a href="http://jakarta.apache.org/poi/poifs/fileformat.html">the Jakarta POIFS file format documentation</a>.<p>Subfiles and directories in a block file system are represented via the <A HREF="../../../../org/archive/util/ms/Entry.html" title="interface in org.archive.util.ms"><CODE>Entry</CODE></A> interface. 
The root directory can be obtained via the <A HREF="../../../../org/archive/util/ms/BlockFileSystem.html#getRoot()"><CODE>BlockFileSystem.getRoot()</CODE></A> method. From there, the child entries can be discovered.<p>The file system divides its subfiles into 512-byte blocks. Those blocks are not necessarily stored in a linear order; blocks from different subfiles may be interspersed with each other. The <A HREF="../../../../org/archive/util/ms/Entry.html#open()"><CODE>Entry.open()</CODE></A> method returns an input stream that provides a continuous view of a subfile's contents. It does so by moving the file pointer of the .doc file behind the scenes.<p>It's important to keep in mind that any given read on a stream produced by a BlockFileSystem may involve:<ol><li>Moving the file pointer to the start of the file to look up the main block allocation table.</li><li>Navigating the file pointer through various allocation structures located throughout the file.</li><li>Finally repositioning the file pointer at the start of the next block to be read.</li></ol><p>So, this package lowers memory consumption at the expense of greater IO activity. A future version of this package will use internal caches to minimize IO activity, providing tunable trade-offs between memory and IO.<h3>The Piece Table</h3><p>The second layer of abstraction between you and the contents of a .doc file is the piece table. Some .doc files are produced using a "fast-save" feature that only writes recent changes to the end of the file. In this case, the text of the document may be fragmented within the document stream itself. Note that this fragmentation is in addition to the block fragmentation described above.<p>A .doc file contains several subfiles within its filesystem. The two that are important for extracting text are named <code>WordDocument</code> and <code>0Table</code>. The <code>WordDocument</code> subfile contains the text of the document. 
The <code>0Table</code> subfile contains supporting information, including the piece table.<p>The piece table is a simple map from logical character position to actual subfile stream position. Additionally, each piece table entry describes whether or not the piece stores text using 16-bit Unicode, or using 8-bit ANSI codes. One .doc file can contain both Unicode and ANSI text. A consequence of this is that <i>every</i> .doc file has a piece table, even those that were not "fast-saved".<p>The reader returned by <CODE>org.archive.util.ms.Doc#getText(SeekInputStream)</CODE> consults the piece table to determine where in the WordDocument subfile the next piece of text is located. It also uses the piece table to determine how bytes should be converted to Unicode characters.<p>Note, however, that any read from such a reader may involve:<ol><li>Moving the file pointer to the piece table.</li><li>Searching the piece table index for the next piece, which may involve moving the file pointer many times.</li><li>Moving the file pointer to that piece's description in the piece table.</li><li>Moving the file pointer to the start of the piece indicated by the description.</li></ol>Since the "file pointer" in this context is the file pointer of the <i>subfile</i>, each move described above may additionally involve:<ol><li>Moving the file pointer to the start of the file to look up the main block allocation table.</li><li>Navigating the file pointer through various allocation structures located throughout the file.</li><li>Finally repositioning the file pointer at the start of the next block to be read.</li></ol>A future implementation will provide an intelligent cache of the piece table, which will hopefully reduce the IO activity required.<P><P><DL></DL><HR><!-- ======= START OF BOTTOM NAVBAR ====== --><A NAME="navbar_bottom"><!-- --></A><A HREF="#skip-navbar_bottom" title="Skip navigation links"></A><TABLE BORDER="0" WIDTH="100%" CELLPADDING="1" 
CELLSPACING="0" SUMMARY=""><TR><TD COLSPAN=2 BGCOLOR="#EEEEFF" CLASS="NavBarCell1"><A NAME="navbar_bottom_firstrow"><!-- --></A><TABLE BORDER="0" CELLPADDING="0" CELLSPACING="3" SUMMARY=""> <TR ALIGN="center" VALIGN="top"> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../overview-summary.html"><FONT CLASS="NavBarFont1"><B>Overview</B></FONT></A> </TD> <TD BGCOLOR="#FFFFFF" CLASS="NavBarCell1Rev"> <FONT CLASS="NavBarFont1Rev"><B>Package</B></FONT> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <FONT CLASS="NavBarFont1">Class</FONT> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-use.html"><FONT CLASS="NavBarFont1"><B>Use</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="package-tree.html"><FONT CLASS="NavBarFont1"><B>Tree</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../deprecated-list.html"><FONT CLASS="NavBarFont1"><B>Deprecated</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../index-all.html"><FONT CLASS="NavBarFont1"><B>Index</B></FONT></A> </TD> <TD BGCOLOR="#EEEEFF" CLASS="NavBarCell1"> <A HREF="../../../../help-doc.html"><FONT CLASS="NavBarFont1"><B>Help</B></FONT></A> </TD> </TR></TABLE></TD><TD ALIGN="right" VALIGN="top" ROWSPAN=3><EM></EM></TD></TR><TR><TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> <A HREF="../../../../org/archive/util/iterator/package-summary.html"><B>PREV PACKAGE</B></A> NEXT PACKAGE</FONT></TD><TD BGCOLOR="white" CLASS="NavBarCell2"><FONT SIZE="-2"> <A HREF="../../../../index.html?org/archive/util/ms/package-summary.html" target="_top"><B>FRAMES</B></A> <A HREF="package-summary.html" target="_top"><B>NO FRAMES</B></A> <SCRIPT type="text/javascript"> <!-- if(window==top) { document.writeln('<A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A>'); } //--></SCRIPT><NOSCRIPT> <A HREF="../../../../allclasses-noframe.html"><B>All Classes</B></A></NOSCRIPT></FONT></TD></TR></TABLE><A 
NAME="skip-navbar_bottom"></A><!-- ======== END OF BOTTOM NAVBAR ======= --><HR>Copyright © 2003-2007 Internet Archive. All Rights Reserved.</BODY></HTML>
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -