C#代码
C#时间戳转换
using System;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Converts a Unix timestamp found in the tag content into a local date
    /// formatted as yyyy-MM-dd. The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content; a Unix timestamp in seconds, or in milliseconds (detected by magnitude).</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The formatted local date, or the unchanged content when it is not a usable timestamp.</returns>
    public string Run(string content,ResponseEntry response){
        if (content == null){
            return content;
        }

        long stamp;
        if (!long.TryParse(content.Trim(), out stamp)){
            // Blank or non-numeric content: hand it back untouched instead of throwing.
            return content;
        }

        // Second-resolution stamps stay below 1e11 until the year 5138, so anything
        // at or above that magnitude is treated as milliseconds.
        bool isMilliseconds = stamp >= 100000000000L || stamp <= -100000000000L;

        DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
        DateTime localTime;
        try{
            DateTime utcTime = isMilliseconds
                ? epochUtc.AddMilliseconds((double)stamp)
                : epochUtc.AddSeconds((double)stamp);
            // Convert after adding the offset so the UTC offset (including DST) of the
            // target date is applied, not the offset that was in force on 1970-01-01.
            localTime = utcTime.ToLocalTime();
        }catch (ArgumentOutOfRangeException){
            // The value does not fit in DateTime's range.
            return content;
        }

        // Invariant culture keeps the output Gregorian regardless of the machine locale.
        return localTime.ToString("yyyy-MM-dd", System.Globalization.CultureInfo.InvariantCulture);
    }
}
采集器处理代码
将base64编码的内容转换为utf-8编码的内容
using System;
using System.Text;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Decodes base64 tag content into UTF-8 text. The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content; only values longer than 1000 characters are decoded.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The decoded text, or the unchanged content when it is short, null, or not valid base64.</returns>
    public string Run(string content,ResponseEntry response){
        // Values of 1000 characters or fewer are passed through as-is.
        if (content == null || content.Length <= 1000){
            return content;
        }

        try{
            byte[] raw = Convert.FromBase64String(content);
            return Encoding.UTF8.GetString(raw);
        }catch (FormatException){
            // Long content that is not base64 after all: keep the original text.
            return content;
        }
    }
}
时间转换处理
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using SpiderInterface;
class LocoyCode{
    // Leading amount of a relative time: 1-4 Arabic digits, or one Chinese numeral (一..九).
    static readonly Regex LeadingAmount = new Regex(@"^\s*(?:(\d{1,4})(?!\d)|([一二三四五六七八九]))");

    /// <summary>
    /// Maps a single Chinese numeral (一..九) to the matching Arabic digit ("1".."9").
    /// The name is kept from the original snippet although the conversion runs
    /// from Chinese to Arabic.
    /// </summary>
    /// <param name="num">A one-character string holding a Chinese numeral.</param>
    /// <returns>The Arabic digit as a string, or an empty string when <paramref name="num"/> is not a single Chinese numeral.</returns>
    static string OneBitNumberToChinese(string num){
        string arabicDigits = "123456789";
        string chineseDigits = "一二三四五六七八九";
        // IndexOf("") returns 0, so empty input must be rejected explicitly.
        if (string.IsNullOrEmpty(num) || num.Length != 1){
            return "";
        }
        int position = chineseDigits.IndexOf(num, StringComparison.Ordinal);
        return position > -1 ? arabicDigits[position].ToString() : "";
    }

    // Reads the amount at the start of the content; false when there is none.
    static bool TryReadAmount(string content, out int amount){
        amount = 0;
        Match match = LeadingAmount.Match(content);
        if (!match.Success){
            return false;
        }
        string digits = match.Groups[1].Success
            ? match.Groups[1].Value
            : OneBitNumberToChinese(match.Groups[2].Value);
        return int.TryParse(digits, out amount);
    }

    /// <summary>
    /// Turns relative times ("刚刚", "N分钟前", "N小时前", "N天前") into a date string
    /// using the long date pattern ("D"). The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The computed date, or the unchanged content when no relative time with a readable amount is found.</returns>
    public string Run(string content,ResponseEntry response){
        if (content == null){
            return content;
        }

        DateTime now = DateTime.Now;
        if (content.Contains("刚刚")){
            return now.ToString("D");
        }

        int amount;
        if (!TryReadAmount(content, out amount)){
            // No leading number (e.g. "半小时前"): leave the text as it is.
            return content;
        }

        if (content.Contains("分钟前")){
            return now.AddMinutes(-amount).ToString("D");
        }
        if (content.Contains("小时前")){
            return now.AddHours(-amount).ToString("D");
        }
        if (content.Contains("天前")){
            return now.AddDays(-amount).ToString("D");
        }
        return content;
    }
}
C#时间格式转换(将几分钟、几小时、几天前、几个月前转换为正常时间)
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Turns relative times (minutes, hours, "N天前", "N月前") into a date string
    /// using the long date pattern ("D"). The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The computed date, or the unchanged content when no relative time with a readable amount is found.</returns>
    public string Run(string content,ResponseEntry response){
        if (content == null){
            return content;
        }

        DateTime now = DateTime.Now;

        // Minutes or hours ago: reported as today's date, as in the original snippet.
        if (content.Contains("小时") || content.Contains("分钟")){
            return now.ToString("D");
        }

        bool isDays = content.Contains("天前");
        bool isMonths = !isDays && content.Contains("月前");
        if (!isDays && !isMonths){
            return content;
        }

        // 1-4 leading digits; longer digit runs are rejected rather than truncated.
        Match match = Regex.Match(content, @"^\s*(\d{1,4})(?!\d)");
        int amount;
        if (!match.Success || !int.TryParse(match.Groups[1].Value, out amount)){
            // No leading number (e.g. "几天前"): leave the text as it is.
            return content;
        }

        // AddMonths follows calendar months instead of approximating a month as 30 days.
        DateTime target = isDays ? now.AddDays(-amount) : now.AddMonths(-amount);
        return target.ToString("D");
    }
}
C#获取系统时间
using System;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Supplies the current system time when the tag captured nothing.
    /// The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The content itself when it is not the empty string; otherwise the current local time in the default format.</returns>
    public string Run(string content,ResponseEntry response){
        bool isEmpty = content == "";
        if (!isEmpty){
            // Something was captured: keep it.
            return content;
        }
        return DateTime.Now.ToString();
    }
}
C#提取字符串中的日期并且格式化
using System;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Reformats the first eight characters of the content (yyyyMMdd) as yyyy-MM-dd.
    /// The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content, expected to start with an eight-character date such as 20210520.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The dashed date, or the unchanged content when it is null or shorter than eight characters.</returns>
    public string Run(string content,ResponseEntry response){
        // Substring would throw on anything shorter than yyyyMMdd.
        if (content == null || content.Length < 8){
            return content;
        }

        string year = content.Substring(0, 4);
        string month = content.Substring(4, 2);
        string day = content.Substring(6, 2);
        return year + "-" + month + "-" + day;
    }
}
时间连接(如20210520)在一起的解决方法:
using System;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Splits a run-together date such as 20210520 into yyyy-MM-dd.
    /// The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content; must consist of exactly eight digits to be converted.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The dashed date, or the unchanged content when it is not exactly eight digits.</returns>
    public string Run(string content,ResponseEntry response){
        if (content == null){
            return content;
        }

        // Match once and reuse the groups.
        Match match = Regex.Match(content, @"^(\d{4})(\d{2})(\d{2})$");
        if (!match.Success){
            // Returning the input avoids producing a bare "--" for unexpected content.
            return content;
        }

        return match.Groups[1].Value + "-" + match.Groups[2].Value + "-" + match.Groups[3].Value;
    }
}
时间戳代码
using System;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Converts a Unix timestamp found in the tag content into a local date and time
    /// formatted as yyyy-MM-dd HH:mm:ss. The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content; a Unix timestamp in seconds, or in milliseconds (detected by magnitude).</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The formatted local date and time, or the unchanged content when it is not a usable timestamp.</returns>
    public string Run(string content,ResponseEntry response){
        if (content == null){
            return content;
        }

        long stamp;
        if (!long.TryParse(content.Trim(), out stamp)){
            // Blank or non-numeric content: hand it back untouched instead of throwing.
            return content;
        }

        // Second-resolution stamps stay below 1e11 until the year 5138, so anything
        // at or above that magnitude is treated as milliseconds.
        bool isMilliseconds = stamp >= 100000000000L || stamp <= -100000000000L;

        DateTime epochUtc = new DateTime(1970, 1, 1, 0, 0, 0, DateTimeKind.Utc);
        DateTime localTime;
        try{
            DateTime utcTime = isMilliseconds
                ? epochUtc.AddMilliseconds((double)stamp)
                : epochUtc.AddSeconds((double)stamp);
            // Convert after adding the offset so the UTC offset (including DST) of the
            // target date is applied, not the offset that was in force on 1970-01-01.
            localTime = utcTime.ToLocalTime();
        }catch (ArgumentOutOfRangeException){
            // The value does not fit in DateTime's range.
            return content;
        }

        // Invariant culture keeps the separators and calendar stable across machine locales.
        return localTime.ToString("yyyy-MM-dd HH:mm:ss", System.Globalization.CultureInfo.InvariantCulture);
    }
}
内容长度处理
using System;
using System.Text.RegularExpressions;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Blanks out content that is too short to be meaningful.
    /// The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>An empty string when fewer than 20 characters survive the filter; otherwise the unchanged content.</returns>
    public string Run(string content,ResponseEntry response){
        // Drop non-word characters, ASCII letters and underscores before counting;
        // what is left is essentially digits and non-ASCII letters (e.g. CJK text).
        Regex noise = new Regex(@"\W|[a-z]|[A-Z]|_");
        string counted = noise.Replace(content, "");
        return counted.Length < 20 ? "" : content;
    }
}
附件的切割与拼接
using System;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Builds attachment links from content shaped like "url1|url2&amp;&amp;name1|name2".
    /// The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content: '|'-separated URLs, then "&amp;&amp;", then '|'-separated link texts.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The concatenated anchor tags, or the unchanged content when it is null or empty.</returns>
    public string Run(string content,ResponseEntry response){
        if (string.IsNullOrEmpty(content)){
            return content;
        }

        string[] halves = content.Split(new string[] {"&&"}, StringSplitOptions.None);
        string[] urlArr = halves[0].Split('|');
        // The name half may be missing entirely when the page exposes URLs only.
        string[] nameArr = halves.Length > 1 ? halves[1].Split('|') : new string[0];

        System.Text.StringBuilder result = new System.Text.StringBuilder();
        for (int i = 0; i < urlArr.Length; i++){
            // Fall back to the URL as link text when there are fewer names than URLs.
            string label = i < nameArr.Length ? nameArr[i] : urlArr[i];
            result.Append("<a href=\"").Append(urlArr[i]).Append("\">").Append(label).Append("</a>");
        }
        return result.ToString();
    }
}
C#解决附件需要分隔然后再拼接的问题
using System;
using System.Collections.Generic;
using SpiderInterface;
class LocoyCode{
    /// <summary>
    /// Wraps every attachment path in the content in an anchor tag.
    /// The class and method names must not be changed.
    /// </summary>
    /// <param name="content">Tag content: attachment paths separated by newlines or tabs.</param>
    /// <param name="response">Page response, which carries properties such as the Url and the original Html (not used here).</param>
    /// <returns>The anchor tags joined without a separator; an empty string when there are no paths.</returns>
    public string Run(string content,ResponseEntry response){
        string[] separators = new string[] {"\n", "\t"};
        string[] paths = content.Split(separators, StringSplitOptions.RemoveEmptyEntries);

        // Every "./" occurrence is removed from the path before it becomes the href.
        string[] anchors = Array.ConvertAll<string, string>(paths, delegate(string path){
            return "<a href='" + path.Replace("./", "") + "'>附件</a>";
        });

        return string.Join("", anchors);
    }
}
C#统一去除html臃肿代码
/// <summary>
/// Strips bloated markup (Office residue, scripts, styles, page-level tags,
/// inline attributes, comments, redundant whitespace) from an HTML fragment.
/// The replacements run in a fixed order; later patterns rely on earlier ones
/// (for example, every quote is normalized to ' before attributes are removed).
/// </summary>
/// <param name="text">HTML text to clean.</param>
/// <returns>An empty string for null or empty input; otherwise the cleaned, trimmed text.</returns>
public static string ModifyHtml(string text)
{
    if (string.IsNullOrEmpty(text))
    { return string.Empty; }
    else
    {
        // Strip Office markup and escaped control sequences.
        text = Regex.Replace(text, "\\\\\"", "'", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "\\\\r\\\\n", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "\\\\r", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "\\\\n", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "\\\\t", "", RegexOptions.IgnoreCase);
        // NOTE(review): this removes the letters "null" anywhere, including inside ordinary words - confirm that is intended.
        text = Regex.Replace(text, "null", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "<divider[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "</divider[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "<iframe[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, "</iframe[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"[""'‘’”“]", "'", RegexOptions.IgnoreCase);
        // The '[' after "<!--" is escaped so only conditional comments match; unescaped it
        // opened a character class and the match could start at any "<!--".
        text = Regex.Replace(text, @"<!--\[if[\S\s]+?<!\[endif\]-->", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<o:p></o:p>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<style[^>]*?>[\s\S]*?</style>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<script[^>]*?>[\s\S]*?</script>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<p[^>]*?>", "<p>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<table[^>]*?>", "<table>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<tr[^>]*?>", "<tr>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<ul[^>]*?>", "<ul>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<li[^>]*?>", "<li>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<!DOCTYPE[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<html[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</html[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<meta[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<head[^>]*?>[\s\S]*?</head>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<body[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</body[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<form[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</form[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<textarea[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</textarea[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"( ){3,}", " ", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"\{['""][^\}]*?['""]\}", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<link[^>]*?>", "", RegexOptions.IgnoreCase);
        //text = Regex.Replace(text, @"<img[^>]*?>", "", RegexOptions.IgnoreCase);// disabled: some content consists of images
        text = Regex.Replace(text, @"<rect[^>]*?>", "", RegexOptions.IgnoreCase);
        //text = Regex.Replace(text, @"<input[^>]*>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<!--([\s\S]*?)-->", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"style=['""]([\s\S]*?)['""]", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"class=['""]([\s\S]*?)['""]", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<strong[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</strong>", "", RegexOptions.IgnoreCase);
        //text = Regex.Replace(text, @"<[/]?h[1-6][^>]*?>", "", RegexOptions.IgnoreCase);// disabled: some pages use headings as line breaks
        text = Regex.Replace(text, @"<div[^>]*?>", "<div>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<font[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</font>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<canvas[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"</canvas>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<span[^>]*?>", "<span>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<label[^>]*?>", "<label>", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<!-[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"<[/]?h[1-9][^>]*?>", "", RegexOptions.IgnoreCase);
        // Remove inline base64 images. The stray spaces in the earlier pattern
        // ("data: image", "[^>] *?> ") kept it from matching real data URIs.
        text = Regex.Replace(text, @"<img[^>]*?src=['""]data:\s*image[^>]*?>", "", RegexOptions.IgnoreCase);
        // The remaining steps must stay last.
        text = Regex.Replace(text, @"<p[^>]*?><br[^>]*?/></p[^>]*?>", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(\r\n){2,}", "\r\n", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(\r)", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(\n)", "", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(\t)", "", RegexOptions.IgnoreCase);// tabs
        text = Regex.Replace(text, @"(<br/>){2,}", "<br />", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(<br>){2,}", "<br />", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(<br />){2,}", "<br />", RegexOptions.IgnoreCase);
        text = Regex.Replace(text, @"(\s){2,}", " ", RegexOptions.IgnoreCase);
        if (!string.IsNullOrWhiteSpace(text))
        {
            // FilterInputHtml is defined outside this snippet; its behavior is not visible here.
            text = FilterInputHtml(text);
        }
        text = text.Trim();
        return text;
    }
}
评论 (0)