?? lexertests.java
字號:
mAcceptable.add ("FORM"); mAcceptable.add ("INPUT"); mAcceptable.add ("!DOCTYPE"); mAcceptable.add ("TBODY"); mAcceptable.add ("B"); mAcceptable.add ("DIV"); mAcceptable.add ("SCRIPT"); mAcceptable.add ("NOSCRIPT"); mAcceptable.add ("STYLE"); mAcceptable.add ("SPAN"); mAcceptable.add ("UL"); mAcceptable.add ("LI"); mAcceptable.add ("IFRAME"); } /** * Test case for bug #789439 Japanese page causes OutOfMemory Exception * No exception is thrown in the current version of the parser, * however, the problem is that ISO-2022-JP (aka JIS) encoding sometimes * causes spurious tags. * The root cause is characters bracketed by [esc]$B and [esc](J (contrary * to what is indicated in the j_s_nightingale analysis of the problem) that * sometimes have an angle bracket (< or 0x3c) embedded in them. These * are taken to be tags by the parser, instead of being considered strings. * <p> * The URL referenced has an ISO-8859-1 encoding (the default), but * Japanese characters are intermixed on the page with English, using the JIS * encoding. We detect failure by looking for weird tag names which were * not correctly handled as string nodes. * <p> * Here is a partial dump of the page with escape sequences: * <pre> * 0002420 1b 24 42 3f 79 4a 42 25 47 25 38 25 2b 25 61 43 * 0002440 35 44 65 43 44 1b 28 4a 20 77 69 74 68 20 43 61 * .. * 0002720 6c 22 3e 4a 53 6b 79 1b 24 42 42 50 31 7e 25 5a * 0002740 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 3c * .. * 0003060 20 69 1b 24 42 25 62 21 3c 25 49 42 50 31 7e 25 * 0003100 5a 21 3c 25 38 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a * .. * 0003220 1b 24 42 25 2d 25 3f 25 5e 25 2f 25 69 24 4e 25 * 0003240 5b 21 3c 25 60 25 5a 21 3c 25 38 1b 28 4a 3c 2f * .. * 0003320 6e 65 31 2e 70 6c 22 3e 1b 24 42 3d 60 48 77 43 * 0003340 66 1b 28 4a 3c 2f 41 3e 3c 50 3e 0a 2d 2d 2d 2d * .. 
* 0004400 46 6f 72 75 6d 20 30 30 39 20 28 1b 24 42 3e 21 * 0004420 3c 6a 24 4b 31 4a 4a 21 44 2e 24 4a 24 49 1b 28 * 0004440 4a 29 3c 2f 41 3e 3c 49 4d 47 20 53 52 43 3d 22 * </pre> * <p> * The fix proposed by j_s_nightingale is implemented to swallow JIS * escape sequences in the string parser. * Apparently the fix won't help EUC-JP and Shift-JIS though, so this may * still be a problem. * It's theoretically possible that JIS encoding, or another one, * could be used as attribute names or values within tags as well, * but this is considered improbable and is therefore not handled in * the tag parser state machine. */ public void testJIS () throws ParserException { Parser parser; NodeIterator iterator; parser = new Parser ("http://www.009.com/"); try { iterator = parser.elements (); while (iterator.hasMoreNodes ()) checkTagNames (iterator.nextNode ()); } catch (EncodingChangeException ece) { parser.reset (); iterator = parser.elements (); while (iterator.hasMoreNodes ()) checkTagNames (iterator.nextNode ()); } } /** * Check the tag name for one of the ones expected on the page. * Recursively check the children. 
*/ public void checkTagNames (Node node) { Tag tag; String name; NodeList children; if (node instanceof Tag) { tag = (Tag)node; name = tag.getTagName (); if (!mAcceptable.contains (name)) fail ("unrecognized tag name \"" + name + "\""); children = tag.getChildren (); if (null != children) for (int i = 0; i < children.size (); i++) checkTagNames (children.elementAt (i)); } } /** * See bug #825820 Words conjoined */ public void testConjoined () throws ParserException { StringBuffer buffer; NodeIterator iterator; Node node; String expected; expected = "The Title\nThis is the body."; String html1 = "<html><title>The Title\n</title>" + "<body>This is <a href=\"foo.html\">the body</a>.</body></html>"; createParser (html1); buffer = new StringBuffer (); for (iterator = parser.elements (); iterator.hasMoreNodes (); ) { node = iterator.nextNode (); String text = node.toPlainTextString (); buffer.append (text); } assertStringEquals ("conjoined text", expected, buffer.toString ()); String html2 = "<html><title>The Title</title>\n" + "<body>This is <a href=\"foo.html\">the body</a>.</body></html>"; createParser (html2); buffer = new StringBuffer (); for (iterator = parser.elements (); iterator.hasMoreNodes (); ) { node = iterator.nextNode (); String text = node.toPlainTextString (); buffer.append (text); } assertStringEquals ("conjoined text", expected, buffer.toString ()); String html3 = "<html><title>The Title</title>" + "<body>\nThis is <a href=\"foo.html\">the body</a>.</body></html>"; createParser (html3); buffer = new StringBuffer (); for (iterator = parser.elements (); iterator.hasMoreNodes (); ) { node = iterator.nextNode (); String text = node.toPlainTextString (); buffer.append (text); } assertStringEquals ("conjoined text", expected, buffer.toString ()); } /** * Check for StackOverflow error. 
*/ public void testStackOverflow () throws ParserException { NodeIterator iterator; Node node; String html; html = "<a href = \"http://test.com\" />"; createParser (html); for (iterator = parser.elements (); iterator.hasMoreNodes (); ) { node = iterator.nextNode (); String text = node.toHtml (); assertStringEquals ("no overflow", html, text); } html = "<a href=\"http://test.com\"/>"; createParser (html); for (iterator = parser.elements (); iterator.hasMoreNodes (); ) { node = iterator.nextNode (); String text = node.toHtml (); assertStringEquals ("no overflow", html, text); } html = "<a href = \"http://test.com\"/>"; createParser (html); for (iterator = parser.elements (); iterator.hasMoreNodes (); ) { node = iterator.nextNode (); String text = node.toHtml (); assertStringEquals ("no overflow", html, text); } } /** * See bug #880283 Character ">" erroneously inserted by Lexer */ public void testJsp () throws ParserException { String html; Lexer lexer; Node node; html = "<% out.urlEncode('abc') + \"<br>\" + out.urlEncode('xyz') %>"; lexer = new Lexer (html); node = lexer.nextNode (); if (node == null) fail ("too few nodes"); else assertStringEquals ("bad html", html, node.toHtml()); assertNull ("too many nodes", lexer.nextNode ()); } /** * See bug #899413 bug in javascript end detection. */ public void testEscapedQuote () throws ParserException { String string; String html; Lexer lexer; Node node; string = "\na='\\'';\n"; html = string + "</script>"; lexer = new Lexer (html); node = lexer.nextNode (true); if (node == null) fail ("too few nodes"); else assertStringEquals ("bad string", string, node.toHtml()); assertNotNull ("too few nodes", lexer.nextNode (true)); assertNull ("too many nodes", lexer.nextNode (true)); } /** * See bug #1227213 Particular SCRIPT tags close too late. 
* A script whose body is wrapped in an HTML comment must parse to a
     * single ScriptTag that renders back to the input and whose script
     * code is the comment text.
     */
    public void testCommentInScript () throws ParserException
    {
        final String cdata = "<!--document.write(\"en\");// -->";
        final String html = "<script>" + cdata + "</script>";

        Parser scriptParser = new Parser ();
        scriptParser.setInputHTML (html);
        NodeIterator nodes = scriptParser.elements ();

        Node first = nodes.nextNode ();
        if (null == first)
            fail ("too few nodes");
        assertStringEquals ("bad parse", html, first.toHtml ());
        assertTrue (first instanceof ScriptTag);
        ScriptTag script = (ScriptTag)first;
        assertStringEquals ("bad cdata", cdata, script.getScriptCode ());
        assertNull ("too many nodes", nodes.nextNode ());
    }

    /**
     * See bug #1227213 Particular SCRIPT tags close too late.
     * This case already worked before the patch, because the
     * ScriptScanner did not use smartquote processing.
     * It is unclear why jwilsonsprings1 reported that the patch worked
     * for him; presumably he was mistaken in thinking that the URL was
     * what caused the failure.
     * <p>
     * A style block containing a url(...) value must parse to a single
     * StyleTag that renders back to the input and whose style code is
     * the original CSS text.
     */
    public void testUrlInStyle () throws ParserException
    {
        final String cdata = ".eSDot {background-image:"
            + "url(http://di.image.eshop.msn.com/img/sys/dot.gif)}";
        final String html = "<style>" + cdata + "</style>";

        Parser styleParser = new Parser ();
        styleParser.setInputHTML (html);
        NodeIterator nodes = styleParser.elements ();

        Node first = nodes.nextNode ();
        if (null == first)
            fail ("too few nodes");
        assertStringEquals ("bad parse", html, first.toHtml ());
        assertTrue (first instanceof StyleTag);
        StyleTag style = (StyleTag)first;
        assertStringEquals ("bad cdata", cdata, style.getStyleCode ());
        assertNull ("too many nodes", nodes.nextNode ());
    }
}
?? 快捷鍵說明
復制代碼
Ctrl + C
搜索代碼
Ctrl + F
全屏模式
F11
切換主題
Ctrl + Shift + D
顯示快捷鍵
?
增大字號
Ctrl + =
減小字號
Ctrl + -