package com.uet.common.utils; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.OutputKeys; import javax.xml.transform.Transformer; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.dom.DOMSource; import javax.xml.transform.stream.StreamResult; import org.apache.commons.io.output.ByteArrayOutputStream; import org.apache.commons.lang3.StringUtils; import org.apache.poi.hwpf.HWPFDocument; import org.apache.poi.hwpf.converter.PicturesManager; import org.apache.poi.hwpf.converter.WordToHtmlConverter; import org.apache.poi.hwpf.usermodel.PictureType; import org.w3c.dom.Document; public class WordUtils { public static String CODING = "GB2312"; static { // 不能漏掉这个,不然jmagick.jar的路径找不到 String osName = System.getProperty("os.name").toLowerCase(); if (osName.indexOf("windows") >= 0) { CODING = "UTF-8"; } } /** * 获得word转为html的内容 * * @param wordFile * @param fileTemp * eg:/tmp/words/ * @throws TransformerException * @throws IOException * @throws ParserConfigurationException */ public static String convert2Html(File wordFile, final String fileTemp) throws TransformerException, IOException, ParserConfigurationException { HWPFDocument wordDocument = new HWPFDocument(new FileInputStream(wordFile)); //wordDocument=new XWPFDocument(new FileInputStream(wordFile)); WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument()); wordToHtmlConverter.setPicturesManager(new PicturesManager() { public String savePicture(byte[] content, PictureType pictureType, String suggestedName, float widthInches, float heightInches) { String imgagePath = fileTemp + suggestedName.toLowerCase(); File file = new File(imgagePath); FileOutputStream fos = null; try { fos = new FileOutputStream(file); fos.write(content); fos.close(); } catch (Exception e) { e.printStackTrace(); } return fileTemp + suggestedName.toLowerCase(); } }); wordToHtmlConverter.processDocument(wordDocument); Document htmlDocument = wordToHtmlConverter.getDocument(); ByteArrayOutputStream out = new ByteArrayOutputStream(); DOMSource domSource = new DOMSource(htmlDocument); StreamResult streamResult = new StreamResult(out); TransformerFactory tf = TransformerFactory.newInstance(); Transformer serializer = tf.newTransformer(); serializer.setOutputProperty(OutputKeys.ENCODING, CODING); serializer.setOutputProperty(OutputKeys.INDENT, "yes"); serializer.setOutputProperty(OutputKeys.METHOD, "html"); serializer.transform(domSource, streamResult); out.close(); String content = new String(out.toByteArray()); content=clean(content); return content; } private static String clean(String content){ content=StringUtils.replace(content, "QUOTE", ""); content=StringUtils.replace(content, "\\* MERGEFORMAT", ""); return content; } }
这个代码,从网上来,但在poi的源码里有个WordToHtmlConverter的main方法,是一样。
有的时候,图片也无法读取,有时候分数的横线无法显示。