用 C# 做一个网页数据采集工具

技术2022-05-19 36

//提取产品列表页中产品最终页的网页 private void button1_Click(object sender, EventArgs e) { if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "") { MessageBox.Show("网址和域名不能为空！", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information); return; } try { string Html = inc.GetHtml("http://www.shaoqun.com/"); //ArrayList al = inc.GetMatchesStr(Html, "<a[^>]*?>.*?</a>"); ArrayList al = inc.GetMatchesStr(Html, @"href/s*=/s*(?:[/'/""/s](?<1>[^/""/']*)[/'/""])");//提取链接

StringBuilder sb = new StringBuilder(); foreach (object var in al) { string a = var.ToString().Replace("/"", "").Replace("'", ""); a = Regex.Replace(a, "href=", "", RegexOptions.IgnoreCase | RegexOptions.Multiline); if (a.StartsWith("/")) a = textBox2.Text.Trim() + a; if (!a.StartsWith("http://")) a = "http://" + a; sb.Append(a + "/r/n"); } textBox5.Text = sb.ToString();//把提取到网址输出到一个textBox，每个链接占一行

MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);

} catch (Exception err) { MessageBox.Show("提取出错！原因：" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information); }

}

//把采集的产品页面html代码进行字符串处理，提取需要的代码，最后保存到本地一个access数据库中，同时提取产品图片地址并自动现在图片到本地images文件夹下 private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e) { //填充产品表 Database.ExecuteNonQuery("delete from Tb_Product"); DataTable dt2 = new DataTable(); OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings); OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn); OleDbCommandBuilder cb = new OleDbCommandBuilder(da); da.Fill(dt2); dt2.Rows.Clear();

BackgroundWorker worker = (BackgroundWorker)sender;//这个是做一个进度条

string[] Urls = textBox5.Text.Trim().ToLower().Replace("/r/n", ",").Split(','); DataTable dt = new DataTable(); StringBuilder ErrorStr = new StringBuilder(); string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images//";

//循环每次采集网址 for (int i = 0; i < Urls.Length; i++) { try { if (!worker.CancellationPending) { if (Urls[i] == "") return; html = inc.GetHtml(Urls[i]);//获取该url的html代码 DataRow NewRow = dt2.NewRow();

//产品名 string ProductName = html.Substring(html.IndexOf("<title>") + 7); NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();

//产品编号 NewRow["ModelId"] = NewRow["ProductName"].ToString().Substring(NewRow["ProductName"].ToString().IndexOf("Model:") + 6).Trim();

//产品介绍，这些都是根据不同网站的html做相应的修改 string Introduce = html.Substring(html.IndexOf("Product Details") + 26); Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim()

NewRow["Introduce"] = Introduce;

//下载图片 string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17); ProductImage = textBox2.Text.Trim() + ProductImage.Substring(ProductImage.IndexOf("src=/"") + 5); ProductImage = ProductImage.Remove(ProductImage.IndexOf("/"")); try { inc.DownFile(ProductImage, ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1)); } catch (Exception) { ErrorStr.Append("下载图片失败，图片地址：" + ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1) + "/r/n"); }

dt2.Rows.Add(NewRow);

//Thread.Sleep(100); worker.ReportProgress((i + 1) * 100 / Urls.Length, i); toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();//进度条 }

} catch (Exception err) { ErrorStr.Append("采集错误：" + err.Message + ";网址：" + Urls[i] + "/r/n"); } } da.Update(dt2); DataBind(dt2); ShowError(ErrorStr.ToString()); }

/// <summary> /// ASPX页面生成静态Html页面

/// </summary> public static string GetHtml(string url) { StreamReader sr = null; string str = null; //读取远程路径 WebRequest request = WebRequest.Create(url); HttpWebResponse response = (HttpWebResponse)request.GetResponse(); sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(response.CharacterSet)); str = sr.ReadToEnd(); sr.Close(); return str; }

// 提取HTML代码中的网址 public static ArrayList GetMatchesStr(string htmlCode, string strRegex) { ArrayList al = new ArrayList();

Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline); MatchCollection m = r.Matches(htmlCode);

for (int i = 0; i < m.Count; i++) { bool rep = false; string strNew = m[i].ToString();

// 过滤重复的URL foreach (string str in al) { if (strNew == str) { rep = true; break; } }

if (!rep) al.Add(strNew); }

al.Sort();

return al; }

public static void DownFile(string Url, string Path) {

HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url); HttpWebResponse response = (HttpWebResponse)request.GetResponse(); Stream stream = response.GetResponseStream(); long size = response.ContentLength; //创建文件流对象 using (FileStream fs = new FileStream(Path, FileMode.OpenOrCreate, FileAccess.Write)) { byte[] b = new byte[1025]; int n = 0; while ((n = stream.Read(b, 0, 1024)) > 0) { fs.Write(b, 0, n); } } }

专利

最新回复(0)