package org.apache.nutch.parse.html;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts publish timestamps from page content.
 *
 * @author xum
 *
 */
public class PublishTimeExtract {

  /**
   * Matches a date-time whose date parts are separated by '-', '/', '.' or
   * the CJK year/month markers (U+5E74, U+6708), followed by an optional day
   * marker (U+65E5), exactly one arbitrary separator character, and a time
   * whose parts are separated by ':' or the CJK hour/minute markers (U+65F6,
   * U+5206). Capturing groups: 1 = year, 2 = month, 3 = day, 4 = hour,
   * 5 = minute, 6 = second (absent when the text has no seconds).
   */
  private static final String TIME_REGEX = "(?::|>|\\s)?(20[0-9]{2})(?:-|/|\\.|\\u5e74)(\\d{1,2})(?:-|/|\\.|\\u6708)(\\d{1,2})(?:\\u65e5)?.(\\d{2})(?::|\\u65f6)(\\d{2})(?:(?::|\\u5206)(\\d{2}))?";
  private static final Pattern TIME_PATTERN = Pattern.compile(TIME_REGEX);

  /** Canonical layout every matched timestamp is rewritten into before parsing. */
  private static final String DATE_FORMAT = "yyyy-MM-dd HH:mm:ss";

  /**
   * URLs treated as forum (BBS) pages. The first alternative previously read
   * "http://bbs/\\..*", which could never match a host such as
   * "bbs.example.com"; the stray '/' has been removed.
   */
  private static final String BBS_URL = "(http://bbs\\..*|http://www\\.tianya\\.cn/[a-zA-Z]*forum/content/.*)";
  private static final Pattern BBS_URL_PATTERN = Pattern.compile(BBS_URL);

  /**
   * Fixed amount added to every returned timestamp.
   * NOTE(review): presumably a UTC+8 adjustment expected by downstream
   * consumers -- confirm before changing.
   */
  private static final long TIME_OFFSET_MILLIS = 8L * 3600 * 1000;

  /**
   * Extracts a publish time from page text.
   *
   * <p>For URLs matching {@link #BBS_URL} the latest timestamp found in the
   * content that is not after the current time is used; for every other URL
   * the first timestamp found is used. When no usable timestamp is found the
   * current time is used instead.
   *
   * @param content page text to scan; {@code null} is treated as containing
   *          no timestamp
   * @param url page URL used to choose the strategy; {@code null} is treated
   *          as a non-BBS URL
   * @return the chosen time as decimal epoch milliseconds (text is parsed in
   *         the JVM default time zone) plus {@link #TIME_OFFSET_MILLIS}
   */
  public static String extractDate(String content, String url) {
    Date now = new Date();
    if (content == null) {
      return withOffset(now);
    }

    Matcher m = TIME_PATTERN.matcher(content);
    // SimpleDateFormat is not thread-safe, so each call gets its own instance
    // instead of sharing a static one.
    SimpleDateFormat sdf = new SimpleDateFormat(DATE_FORMAT);

    if (url != null && BBS_URL_PATTERN.matcher(url).matches()) {
      // Forum page: pick the most recent post time, ignoring future dates.
      Date latest = null;
      while (m.find()) {
        Date candidate = parseMatch(m, sdf);
        if (candidate == null || candidate.after(now)) {
          continue;
        }
        if (latest == null || candidate.after(latest)) {
          latest = candidate;
        }
      }
      if (latest != null) {
        return withOffset(latest);
      }
    } else if (m.find()) {
      // News page: the first timestamp on the page is taken as publish time.
      Date first = parseMatch(m, sdf);
      if (first != null) {
        return withOffset(first);
      }
    }

    return withOffset(now);
  }

  /**
   * Builds a canonical "yyyy-MM-dd HH:mm:ss" string from the captured numeric
   * fields of the current match and parses it; missing seconds become "00".
   * Returns {@code null} when the text cannot be parsed.
   */
  private static Date parseMatch(Matcher m, SimpleDateFormat sdf) {
    String seconds = m.group(6) == null ? "00" : m.group(6);
    String normalized = m.group(1) + "-" + m.group(2) + "-" + m.group(3)
        + " " + m.group(4) + ":" + m.group(5) + ":" + seconds;
    try {
      return sdf.parse(normalized);
    } catch (ParseException ignored) {
      // Unparseable candidate: the caller skips it or falls back to "now".
      return null;
    }
  }

  /** Renders a date as epoch milliseconds plus the fixed offset. */
  private static String withOffset(Date date) {
    return String.valueOf(date.getTime() + TIME_OFFSET_MILLIS);
  }

}