

// Uploads a Word document and converts it to HTML by automating Word.
// Requires a project reference to Microsoft.Office.Interop.Word.

// The client-supplied name is untrusted: keep only its leaf name so it cannot
// point outside the upload folder.
string wordPath = Server.MapPath("/Fileword/" + Path.GetFileName(FileUpload1.FileName));
string htmlPath = Server.MapPath("/Fileword/测试.html");

// Word writes the resources of an HTML export into a sibling "<name>.files" folder.
// Path.ChangeExtension only touches the extension, unlike a plain string Replace,
// which would also rewrite any ".html" occurring earlier in the physical path.
string htmlResourceDir = Path.ChangeExtension(htmlPath, ".files");

// Make sure the target folders exist before anything is written into them
// (CreateDirectory does nothing when the folder is already there).
Directory.CreateDirectory(Path.GetDirectoryName(wordPath));
Directory.CreateDirectory(Path.GetDirectoryName(htmlPath));

// Store the uploaded Word file on disk.
FileUpload1.SaveAs(wordPath);

#region File format conversion
ApplicationClass word = new ApplicationClass();
Type wordType = word.GetType();
try
{
    Documents docs = word.Documents;

    // Open the uploaded file (late-bound call; the two trailing arguments are passed as true).
    Type docsType = docs.GetType();
    object fileName = wordPath;
    Document doc = (Document)docsType.InvokeMember("Open", BindingFlags.InvokeMethod, null, (object)docs, new Object[] { fileName, true, true });

    // Remove the output of a previous conversion, if any.
    if (File.Exists(htmlPath))
    {
        File.Delete(htmlPath);
    }
    if (Directory.Exists(htmlResourceDir))
    {
        Directory.Delete(htmlResourceDir, true);
    }

    // Convert by calling Word's "Save As" with the HTML format.
    Type docType = doc.GetType();
    object saveFileName = htmlPath;
    docType.InvokeMember("SaveAs", BindingFlags.InvokeMethod, null, doc, new object[] { saveFileName, WdSaveFormat.wdFormatHTML });
}
finally
{
    // Always quit Word, even when opening or saving failed; otherwise the
    // Word process started above would be left running.
    wordType.InvokeMember("Quit", BindingFlags.InvokeMethod, null, word, null);
}
#endregion
这样生成的 HTML 会包含很多冗余标记,可以用下面的方法去除:


/// <summary>
/// Removes markup from an HTML string by applying a fixed list of regular
/// expressions, then normalizes inline <c>style</c> attributes.
/// </summary>
/// <param name="html">The HTML text to clean.</param>
/// <returns>
/// The cleaned HTML. Single-quoted <c>style='...'</c> attributes are re-quoted with
/// double quotes, and every <c>style</c> value keeps only the declarations that
/// start with <c>background:</c> or <c>color:</c>.
/// </returns>
public static string CleanWordHtml(string html)
{
    // Each pattern is blanked out in turn, case-insensitively; the order matters
    // because later patterns run on the output of earlier ones.
    string[] removalPatterns =
    {
        @"<!--(\w|\W)+?>",
        @"<!--(\w|\W)+?-->",
        @"<style>(\w|\W)+?</style>",
        @"\s?class=\w+",
        @"<(meta|link|/?o:|/?font|/?strong|/?st\d|/?head|/?html|body|/?body|/?w:|/?m:|/?v:|!\[)[^>]*?>",
        // NOTE(review): this pattern may have lost an entity (e.g. a non-breaking
        // space) when the code was published - confirm against the intended source.
        @"(<[^>]+>)+ ()+",
        // Strip <xml> elements together with everything inside them.
        @"<xml>(\w|\W)+?</xml>",
        // NOTE(review): matches LF-CR pairs, not CR-LF - confirm this is intended.
        @"(\n\r){2,}",
    };

    foreach (string pattern in removalPatterns)
    {
        html = Regex.Replace(html, pattern, string.Empty, RegexOptions.IgnoreCase);
    }

    // Re-quote single-quoted style attributes: inner double quotes become spaces
    // first so that the new outer double quotes stay balanced.
    MatchCollection singleQuotedStyles = Regex.Matches(html, "style='[^']+'", RegexOptions.IgnoreCase);
    foreach (Match attribute in singleQuotedStyles)
    {
        string before = attribute.Value;
        string after = before.Replace('"', ' ').Replace("'", "\"");
        html = html.Replace(before, after);
    }

    // Inside every style value, drop all declarations except background/color.
    return Regex.Replace(html, @"(?<=style=['""])[^'""]*(?=['""])", styleValue =>
    {
        string[] kept = styleValue.Value
            .Split(';')
            .Where(declaration => Regex.IsMatch(declaration.Trim(), @"^(background|color):"))
            .ToArray();
        return string.Join(";", kept);
    });
}
可以根据自己的需求修改上面的正则表达式。