当前位置: 技术问答>java相关
给1100分!
来源: 互联网 发布时间:2017-03-28
本文导语: 请问怎样从一个字符串解析Html? 比方说,我有一个含有Html代码的String,现在想把这个String里的Html的Tag全部解析出来并保持Html的结构不变. 例如 String str="ntttnAAAn"; 问T解决了另开10贴,一起给1100分! | ...
请问怎样从一个字符串解析Html?
比方说,我有一个含有Html代码的String,现在想把这个String里的Html的Tag全部解析出来并保持Html的结构不变.
例如
String str="ntttnAAAn";
问题解决了另开10贴,一起给1100分!
比方说,我有一个含有Html代码的String,现在想把这个String里的Html的Tag全部解析出来并保持Html的结构不变.
例如
String str="ntttnAAAn";
问题解决了另开10贴,一起给1100分!
|
看看这个行不行, 有问题再找我
可参考 JDK 的 javax.swing.text.html 包及 javax.swing.text.html.parser 包的源码
import javax.swing.text.html.parser.*;
import java.util.*;
import java.io.*;
/**
 * Minimal HTML parser that logs every parser callback to standard output.
 *
 * It extends the JDK's {@code javax.swing.text.html.parser.Parser} and
 * overrides the handle* callbacks so that each start tag, end tag, empty
 * tag, comment and text run is printed in the order the parser reports it.
 */
public class DocumentParser extends javax.swing.text.html.parser.Parser {
    /** Recorded by {@link #parse(Reader, boolean)}; no callback in this class reads it. */
    private boolean ignoreCharSet = false;

    /** Compile-time switch for all logging done by this class. */
    private static final boolean debugFlag = true;

    /**
     * Creates a parser that uses the given DTD.
     *
     * @param dtd the DTD handed to the underlying Parser
     */
    public DocumentParser(DTD dtd) {
        super(dtd);
    }

    /**
     * Parses the whole stream, firing the handle* callbacks as it goes.
     *
     * @param in            source of the HTML text
     * @param ignoreCharSet stored on this instance before parsing starts
     * @throws IOException if reading from {@code in} fails
     */
    public void parse(Reader in, boolean ignoreCharSet) throws IOException {
        this.ignoreCharSet = ignoreCharSet;
        parse(in);
    }

    /**
     * Handle Start Tag.
     */
    protected void handleStartTag(TagElement tag) {
        if (debugFlag) {
            debug(describe("Start Tag", tag));
        }
    }

    /**
     * Handle Comment.
     */
    protected void handleComment(char text[]) {
        if (debugFlag) {
            debug("comment: ->" + new String(text));
        }
    }

    /**
     * Handle Empty Tag.
     */
    protected void handleEmptyTag(TagElement tag) {
        if (debugFlag) {
            debug(describe("Empty Tag", tag));
        }
    }

    /**
     * Handle End Tag.
     */
    protected void handleEndTag(TagElement tag) {
        if (debugFlag) {
            // Name the tag being closed; a bare "End Tag: " line cannot be
            // matched back to the element it belongs to.
            debug("End Tag: " + tag.getHTMLTag());
        }
    }

    /**
     * Handle Text.
     */
    protected void handleText(char data[]) {
        if (debugFlag) {
            debug("text: ->" + new String(data));
        }
    }

    /**
     * Error handling: report the problem instead of dropping it silently.
     * Errors go to standard error so they do not mix with the tag log.
     */
    protected void handleError(int ln, String errorMsg) {
        if (debugFlag) {
            System.err.println("parse error (line " + ln + "): " + errorMsg);
        }
    }

    /**
     * Builds the log line for a start or empty tag. The tag name is always
     * included; the attributes are added only for tags that are not
     * fictional, matching the branch structure of the original listing.
     */
    private String describe(String label, TagElement tag) {
        String line = label + ": " + tag.getHTMLTag();
        if (!tag.fictional()) {
            line = line + " attributes: " + getAttributes();
        }
        return line;
    }

    /*
     * debug messages
     */
    private void debug(String msg) {
        System.out.println(msg);
    }

    /**
     * Parses the first command-line argument as HTML, or the sample string
     * from the original listing when no argument is given.
     */
    public static void main(String[] args) throws Exception {
        String str = args.length > 0 ? args[0] : "ntttnAAAn";
        StringReader reader = new StringReader(str);
        new DocumentParser(DTD.getDTD("html40")).parse(reader, false);
    }
}
可参考 JDK 的 javax.swing.text.html 包及 javax.swing.text.html.parser 包的源码
import javax.swing.text.html.parser.*;
import java.util.*;
import java.io.*;
/**
 * Minimal HTML parser that logs every parser callback to standard output.
 *
 * It extends the JDK's {@code javax.swing.text.html.parser.Parser} and
 * overrides the handle* callbacks so that each start tag, end tag, empty
 * tag, comment and text run is printed in the order the parser reports it.
 */
public class DocumentParser extends javax.swing.text.html.parser.Parser {
    /** Recorded by {@link #parse(Reader, boolean)}; no callback in this class reads it. */
    private boolean ignoreCharSet = false;

    /** Compile-time switch for all logging done by this class. */
    private static final boolean debugFlag = true;

    /**
     * Creates a parser that uses the given DTD.
     *
     * @param dtd the DTD handed to the underlying Parser
     */
    public DocumentParser(DTD dtd) {
        super(dtd);
    }

    /**
     * Parses the whole stream, firing the handle* callbacks as it goes.
     *
     * @param in            source of the HTML text
     * @param ignoreCharSet stored on this instance before parsing starts
     * @throws IOException if reading from {@code in} fails
     */
    public void parse(Reader in, boolean ignoreCharSet) throws IOException {
        this.ignoreCharSet = ignoreCharSet;
        parse(in);
    }

    /**
     * Handle Start Tag.
     */
    protected void handleStartTag(TagElement tag) {
        if (debugFlag) {
            debug(describe("Start Tag", tag));
        }
    }

    /**
     * Handle Comment.
     */
    protected void handleComment(char text[]) {
        if (debugFlag) {
            debug("comment: ->" + new String(text));
        }
    }

    /**
     * Handle Empty Tag.
     */
    protected void handleEmptyTag(TagElement tag) {
        if (debugFlag) {
            debug(describe("Empty Tag", tag));
        }
    }

    /**
     * Handle End Tag.
     */
    protected void handleEndTag(TagElement tag) {
        if (debugFlag) {
            // Name the tag being closed; a bare "End Tag: " line cannot be
            // matched back to the element it belongs to.
            debug("End Tag: " + tag.getHTMLTag());
        }
    }

    /**
     * Handle Text.
     */
    protected void handleText(char data[]) {
        if (debugFlag) {
            debug("text: ->" + new String(data));
        }
    }

    /**
     * Error handling: report the problem instead of dropping it silently.
     * Errors go to standard error so they do not mix with the tag log.
     */
    protected void handleError(int ln, String errorMsg) {
        if (debugFlag) {
            System.err.println("parse error (line " + ln + "): " + errorMsg);
        }
    }

    /**
     * Builds the log line for a start or empty tag. The tag name is always
     * included; the attributes are added only for tags that are not
     * fictional, matching the branch structure of the original listing.
     */
    private String describe(String label, TagElement tag) {
        String line = label + ": " + tag.getHTMLTag();
        if (!tag.fictional()) {
            line = line + " attributes: " + getAttributes();
        }
        return line;
    }

    /*
     * debug messages
     */
    private void debug(String msg) {
        System.out.println(msg);
    }

    /**
     * Parses the first command-line argument as HTML, or the sample string
     * from the original listing when no argument is given.
     */
    public static void main(String[] args) throws Exception {
        String str = args.length > 0 ? args[0] : "ntttnAAAn";
        StringReader reader = new StringReader(str);
        new DocumentParser(DTD.getDTD("html40")).parse(reader, false);
    }
}
|
知道javacc吗?做html解析器再合适不过了。而且javacc本身就带了一个html parser的例子。现成的东西 :)
http://www.webgain.com/products/java_cc/
ok
http://www.webgain.com/products/java_cc/
ok
|
先给你一个类似简单的,你继续升华:
我做的是直接访问网络的miniBrowser,你可以修改一下变成读取String ;
import java.net.*;
import java.io.*;
/**
 * A miniature line-mode browser. It connects to a URL given on the command
 * line and prints the contents of the page to standard output.
 *
 * Format:
 *
 *   java miniBrowse URL raw|cooked
 *
 * For example:
 *   java miniBrowse http://www.zju.edu.cn
 *
 * Raw mode dumps the HTML data stream unchanged. Cooked mode does some
 * rudimentary parsing: tags are removed, a few of them (BR, HR, UL, OL, LI,
 * P) are turned into line breaks or list markers, and everything before the
 * BODY tag is skipped. Cooked is the default, to avoid cluttering the screen.
 *
 * @author zhouqi registerno:20221328
 *
 * @version 1.0.0.0
 *
 */
public class miniBrowse
{
    /** URL of the page most recently requested. */
    static URL addr = null;

    /** Output mode, "raw" or "cooked"; null is treated as "cooked". */
    static String method = null;

    /**
     * Fetches the page and prints it to standard output in the mode selected
     * by {@link #method}. A malformed URL ends the program with exit code 4;
     * a failure while reading the page is reported and the constructor returns.
     *
     * @param pageURL passed from the main function input param as args[0]
     */
    miniBrowse(String pageURL)
    {
        PrintWriter out = new PrintWriter(new BufferedWriter(
                new OutputStreamWriter(System.out)));
        try
        {
            addr = new URL(pageURL);
        }
        catch (MalformedURLException e)
        {
            out.println("Invalid URL.");
            // The writer is buffered: flush, or the message is lost on exit.
            out.flush();
            System.exit(4);
        }

        BufferedReader in = null;
        try
        {
            URLConnection page = addr.openConnection();
            in = new BufferedReader(new InputStreamReader(page.getInputStream()));
            if (method == null || method.equalsIgnoreCase("cooked"))
                cookedOutput(in, out);
            else
                rawOutput(in, out);
        }
        catch (IOException e)
        {
            // Tell the user why nothing was printed instead of failing silently.
            out.println("Unable to read " + pageURL + ": " + e);
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
            out.flush();
        }
    }

    /**
     * Writes a plain-text rendering of the HTML read from {@code in}.
     * Text is echoed only after a BODY tag has been seen, and runs of
     * spaces are collapsed to one. An unterminated tag prints
     * "Missing '>'" and ends the program with exit code 8.
     *
     * @param in  source of the HTML text
     * @param out destination of the rendered text; flushed before returning
     */
    public void cookedOutput(BufferedReader in, PrintWriter out)
    {
        int chr;
        int pchr = ' ';        // last character considered for echoing
        boolean skip = true;   // true until the BODY tag has been seen
        String list = "";      // kind of the list currently open: "UL" or "OL"
        int lcount = 0;        // item counter for ordered lists
        try
        {
            while ((chr = in.read()) != -1)
            {
                /*
                   found an HTML tag, let's extract it
                */
                if (chr == '<')
                {
                    String tag = readTag(in);
                    if (tag == null)
                    {
                        out.println("Missing '>'");
                        out.flush();
                        System.exit(8);
                    }
                    /*
                       some basic tag processing (default is skip quietly)
                       notice we skip the entire HEAD section!!
                       A, IMG, APPLET and SCRIPT tags produce no output.
                    */
                    if (tag.startsWith("BODY"))
                        skip = false;
                    if (tag.equals("BR"))
                        out.println("");
                    if (tag.equals("HR"))
                        out.println("\n-------------");
                    /*
                       lists, ordered and unordered - new line and indent
                       each member (ordered list numbers members)
                    */
                    if (tag.equals("UL") || tag.equals("OL"))
                    {
                        out.println("");
                        list = tag;
                        lcount = 0;
                    }
                    if (tag.equals("LI"))
                    {
                        if (list.equals("OL"))
                            out.print("\n" + ++lcount);
                        else
                            out.print("\n-- ");
                    }
                    if (tag.equals("P"))
                        out.println("\n\n");
                    continue;
                }
                if (!skip)
                {
                    /*
                       don't echo excess spaces
                    */
                    if (chr != ' ' || pchr != ' ')
                        out.write(chr);
                    pchr = chr;
                }
            }
        }
        catch (IOException e)
        {
            out.println("I/O Exception.");
        }
        finally
        {
            out.flush();
        }
    }

    /**
     * Reads the remainder of a tag, the opening '<' having been consumed.
     *
     * @return the text up to (not including) the closing '>', upper-cased,
     *         or null when the input ends before a '>' is found
     */
    private static String readTag(BufferedReader in) throws IOException
    {
        StringBuilder tag = new StringBuilder();
        int c;
        while ((c = in.read()) != '>')
        {
            if (c == -1)
                return null;
            tag.append(Character.toUpperCase((char) c));
        }
        return tag.toString();
    }

    /**
     * Copies the input to the output line by line, unchanged.
     *
     * @param in  source of the HTML text
     * @param out destination; flushed before returning
     */
    public void rawOutput(BufferedReader in, PrintWriter out)
    {
        String inputLine;
        try
        {
            while ((inputLine = in.readLine()) != null)
                out.println(inputLine);
        }
        catch (IOException e)
        {
            out.println("I/O Exception.");
        }
        out.flush();
    }

    /**
     * Command-line entry point; does nothing when no URL is supplied.
     *
     * @param args args[0] is the URL, e.g. http://www.zju.edu.cn;
     *             args[1] is raw or cooked, default is cooked
     */
    public static void main(String[] args)
    {
        if (args.length > 1)
            method = args[1];
        else
            method = "cooked";
        if (args.length > 0)
            new miniBrowse(args[0]);
    }
}
我做的是直接访问网络的miniBrowser,你可以修改一下变成读取String ;
import java.net.*;
import java.io.*;
/**
 * A miniature line-mode browser. It connects to a URL given on the command
 * line and prints the contents of the page to standard output.
 *
 * Format:
 *
 *   java miniBrowse URL raw|cooked
 *
 * For example:
 *   java miniBrowse http://www.zju.edu.cn
 *
 * Raw mode dumps the HTML data stream unchanged. Cooked mode does some
 * rudimentary parsing: tags are removed, a few of them (BR, HR, UL, OL, LI,
 * P) are turned into line breaks or list markers, and everything before the
 * BODY tag is skipped. Cooked is the default, to avoid cluttering the screen.
 *
 * @author zhouqi registerno:20221328
 *
 * @version 1.0.0.0
 *
 */
public class miniBrowse
{
    /** URL of the page most recently requested. */
    static URL addr = null;

    /** Output mode, "raw" or "cooked"; null is treated as "cooked". */
    static String method = null;

    /**
     * Fetches the page and prints it to standard output in the mode selected
     * by {@link #method}. A malformed URL ends the program with exit code 4;
     * a failure while reading the page is reported and the constructor returns.
     *
     * @param pageURL passed from the main function input param as args[0]
     */
    miniBrowse(String pageURL)
    {
        PrintWriter out = new PrintWriter(new BufferedWriter(
                new OutputStreamWriter(System.out)));
        try
        {
            addr = new URL(pageURL);
        }
        catch (MalformedURLException e)
        {
            out.println("Invalid URL.");
            // The writer is buffered: flush, or the message is lost on exit.
            out.flush();
            System.exit(4);
        }

        BufferedReader in = null;
        try
        {
            URLConnection page = addr.openConnection();
            in = new BufferedReader(new InputStreamReader(page.getInputStream()));
            if (method == null || method.equalsIgnoreCase("cooked"))
                cookedOutput(in, out);
            else
                rawOutput(in, out);
        }
        catch (IOException e)
        {
            // Tell the user why nothing was printed instead of failing silently.
            out.println("Unable to read " + pageURL + ": " + e);
        }
        finally
        {
            if (in != null)
            {
                try
                {
                    in.close();
                }
                catch (IOException ignored)
                {
                    // Nothing useful can be done if closing the reader fails.
                }
            }
            out.flush();
        }
    }

    /**
     * Writes a plain-text rendering of the HTML read from {@code in}.
     * Text is echoed only after a BODY tag has been seen, and runs of
     * spaces are collapsed to one. An unterminated tag prints
     * "Missing '>'" and ends the program with exit code 8.
     *
     * @param in  source of the HTML text
     * @param out destination of the rendered text; flushed before returning
     */
    public void cookedOutput(BufferedReader in, PrintWriter out)
    {
        int chr;
        int pchr = ' ';        // last character considered for echoing
        boolean skip = true;   // true until the BODY tag has been seen
        String list = "";      // kind of the list currently open: "UL" or "OL"
        int lcount = 0;        // item counter for ordered lists
        try
        {
            while ((chr = in.read()) != -1)
            {
                /*
                   found an HTML tag, let's extract it
                */
                if (chr == '<')
                {
                    String tag = readTag(in);
                    if (tag == null)
                    {
                        out.println("Missing '>'");
                        out.flush();
                        System.exit(8);
                    }
                    /*
                       some basic tag processing (default is skip quietly)
                       notice we skip the entire HEAD section!!
                       A, IMG, APPLET and SCRIPT tags produce no output.
                    */
                    if (tag.startsWith("BODY"))
                        skip = false;
                    if (tag.equals("BR"))
                        out.println("");
                    if (tag.equals("HR"))
                        out.println("\n-------------");
                    /*
                       lists, ordered and unordered - new line and indent
                       each member (ordered list numbers members)
                    */
                    if (tag.equals("UL") || tag.equals("OL"))
                    {
                        out.println("");
                        list = tag;
                        lcount = 0;
                    }
                    if (tag.equals("LI"))
                    {
                        if (list.equals("OL"))
                            out.print("\n" + ++lcount);
                        else
                            out.print("\n-- ");
                    }
                    if (tag.equals("P"))
                        out.println("\n\n");
                    continue;
                }
                if (!skip)
                {
                    /*
                       don't echo excess spaces
                    */
                    if (chr != ' ' || pchr != ' ')
                        out.write(chr);
                    pchr = chr;
                }
            }
        }
        catch (IOException e)
        {
            out.println("I/O Exception.");
        }
        finally
        {
            out.flush();
        }
    }

    /**
     * Reads the remainder of a tag, the opening '<' having been consumed.
     *
     * @return the text up to (not including) the closing '>', upper-cased,
     *         or null when the input ends before a '>' is found
     */
    private static String readTag(BufferedReader in) throws IOException
    {
        StringBuilder tag = new StringBuilder();
        int c;
        while ((c = in.read()) != '>')
        {
            if (c == -1)
                return null;
            tag.append(Character.toUpperCase((char) c));
        }
        return tag.toString();
    }

    /**
     * Copies the input to the output line by line, unchanged.
     *
     * @param in  source of the HTML text
     * @param out destination; flushed before returning
     */
    public void rawOutput(BufferedReader in, PrintWriter out)
    {
        String inputLine;
        try
        {
            while ((inputLine = in.readLine()) != null)
                out.println(inputLine);
        }
        catch (IOException e)
        {
            out.println("I/O Exception.");
        }
        out.flush();
    }

    /**
     * Command-line entry point; does nothing when no URL is supplied.
     *
     * @param args args[0] is the URL, e.g. http://www.zju.edu.cn;
     *             args[1] is raw or cooked, default is cooked
     */
    public static void main(String[] args)
    {
        if (args.length > 1)
            method = args[1];
        else
            method = "cooked";
        if (args.length > 0)
            new miniBrowse(args[0]);
    }
}
|
做个编译器撒
|
模仿xml的解析器做吧
|
你可以用sun的jaxp包,是一个专门对xml进行处理的包。
|
简单点,用递归算法行不行?遇到一个""
|
学习。
|
先把字符串中的 \n 替换为空格,再把字符串作为 InputStream,生成一个 html 文件
本站(WWW.)旨在分享和传播互联网科技相关的资讯和技术,将尽最大努力为读者提供更好的信息聚合和浏览方式。
本站(WWW.)站内文章除注明原创外,均为转载、整理或搜集自网络。欢迎任何形式的转载,转载请注明出处。
本站(WWW.)站内文章除注明原创外,均为转载、整理或搜集自网络。欢迎任何形式的转载,转载请注明出处。