用Flying Saucer+Jsoup生成个人博客文集
2014-07-27
在wytk2008.net写了3年的博客,积累了102篇博文。想想大学毕业后可以做一个小结,一方面把文章作个备份,另一方面想想工作以后可以作些什么改变,比如我正打算把博客迁移到Github或者阿里云的服务器上,加快访问速度和减少维护成本。
原先打算用Latex生成漂亮的文章,但是没有发现合适的API可以用,如果要较完整地保留原博文的格式、图片等信息,自己写一个从HTML到Latex的转码甚是耗时。于是辗转发现一个叫Flying Saucer的Java库,可以很好地将带CSS的网页“打印”成PDF,于是花了两天的时间把博客的文章拉取下来制成了一个文集,预览地址:Book
####文章爬取
整个过程包括文章爬取,内容提取和生成PDF,均通过Java完成。个人java还比较渣,写得不太好。
package crawler_wordpress;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.*;
import java.net.SocketTimeoutException;
import java.sql.SQLException;
public class FirstCrawler {
public static void main(String[] args) throws SQLException, IOException
{
getPage("http://www.wytk2008.net/life/1304/");
}
public static void getPage(String URL) throws SQLException,IOException,SocketTimeoutException
{
boolean succ = false;
while(!succ)
{
try{
processPage(URL);
succ = true;
}
catch(Exception e){
System.out.println("Main() - Time out Exception.");
}
}
}
public static void processPage(String URL) throws SQLException,IOException,SocketTimeoutException
{
//获取网页
Document doc = Jsoup.connect(URL).get();
//提取标签,获取主内容块
Element content = doc.getElementById("main");
Elements links = content.getElementsByTag("a");
//提取文章正文
Elements articles = content.getElementsByTag("article");
Element article = null;
for (Element a : articles)
{
article = a;
break;
}
//提取评论内容
Elements comments = content.getElementsByClass("comment-list");
Element comment_list = null;
for (Element comment : comments)
{
comment_list = comment;
}
//处理多余的元素
try{
//删除修改时间
article.select("time.updated").remove();
//删除目录块
article.select("footer.entry-meta").remove();
//删除评论中的头像
comment_list.select("img[src]").remove();
//删除回复按钮
comment_list.select("div.reply").remove();
}
catch(Exception e)
{
System.out.println("Nothing to remove.");
}
String post_id = article.attr("id");
String post_name = article.select("h1.entry-title").text();
System.out.print("Crawling "+post_id+" ");
System.out.println(post_name);
//处理不可用于文件名的非法字符
if (post_name.contains(":"))
{
post_name = post_name.split(":")[0];
}
//HTML头部
String header = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" +
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Strict//EN\" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd\">\n" +
"<html xmlns=\"http://www.w3.org/1999/xhtml\">\n" +
"<head>\n" +
" <title>"+post_name+"</title>\n" +
" <style type=\"text/css\"> body {font-family:Georgia,SimSun;} p {line-height:25px; text-indent:2em;} "+
"img {text-align:center;max-width:640px; height:auto;width:expression(this.width > 640 ? \"640px\" : this.width);} " +
"a { text-decoration:none; } h1 {font-family:SimHei;} ul li{ margin:0; padding-top:5px;} </style>\n" +
"</head>";
String outputs;
if (comment_list != null)
{
outputs = header +
"<body>\n" +
article +
comment_list +
"</body></html>";
}
else
{
outputs = header +
"<body>\n" +
article +
"</body></html>";
}
String fileName = "src/pages/"+post_id+"-"+post_name+".html";
FileOutputStream f = new FileOutputStream(fileName);
OutputStreamWriter out = new OutputStreamWriter(f,"UTF-8");
out.write(outputs);
out.flush();
out.close();
//获取前一篇文章的URL
for (Element link : links)
{
String linkHref = link.attr("href");
String linkrel = link.attr("rel");
if (linkrel.equals("prev"))
{
System.out.println("Next Article URL: "+linkHref);
getPage(linkHref);
}
}
}
}
####PDF生成
首先获取目录中HTML的文件名,对于每个HTML文件,用render进行渲染输出。注意此处应该添加中文字体支持。另外,原作者的library,并不能很好地支持中文换行,有人对此做了修改,网上可以下载到改好的包。
package oliver.itext.html2pdf;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;
import com.lowagie.text.pdf.BaseFont;
import org.xhtmlrenderer.pdf.ITextFontResolver;
import org.xhtmlrenderer.pdf.ITextRenderer;
import com.lowagie.text.DocumentException;
import org.xhtmlrenderer.pdf.TrueTypeUtil;
public class FirstDoc
{
private static ITextRenderer render = new ITextRenderer();
public static void main(String[] args) throws DocumentException, IOException
{
String path = System.getProperty("user.dir") + "/src/";
List<String> fileName_list = getFileNames(path);
for (String fileName : fileName_list) {
System.out.println("Processing "+fileName);
String inputFile = path + "pages/" + fileName + ".html";
String url = new File(inputFile).toURI().toURL().toString();
String outputFile = path + "outputs/" + fileName + ".pdf";
OutputStream os = new FileOutputStream(outputFile);
render.setDocument(url);
ITextFontResolver fontResolver = render.getFontResolver();
//添加字体:宋体、黑体
fontResolver.addFont("C:/Windows/Fonts/simsun.TTC", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
fontResolver.addFont("C:/Windows/Fonts/simhei.ttf", BaseFont.IDENTITY_H, BaseFont.NOT_EMBEDDED);
render.layout();
render.createPDF(os);
os.close();
}
}
//获取目录下所有文件名
private static List<String> getFileNames(String path) throws IOException {
List<String> names = new ArrayList<String>();
File file = new File(path + "/pages");
File[] lf = file.listFiles();
for (int i = 0; i < lf.length; i++) {
String filename = lf[i].getName();
if (filename.endsWith("html")) {
String name = filename.replace(".html", "");
names.add(name);
}
}
return names;
}
}