用C#做一个网页数据采集工具

    技术2022-05-19  28

    /// <summary>
    /// Extracts the hyperlinks found on the product list page and writes them,
    /// one per line, into <c>textBox5</c>.
    /// </summary>
    /// <param name="sender">Event source (not used).</param>
    /// <param name="e">Event data (not used).</param>
    private void button1_Click(object sender, EventArgs e)
    {
        if (textBox1.Text.Trim() == "" || textBox2.Text.Trim() == "")
        {
            MessageBox.Show("网址和域名不能为空!", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
            return;
        }

        try
        {
            // NOTE(review): the page URL is hard-coded although textBox1 is validated above;
            // confirm whether textBox1.Text was meant to be fetched here.
            string Html = inc.GetHtml("http://www.shaoqun.com/");

            // Every match is a whole attribute such as href="/p/1.html" (quotes included).
            ArrayList al = inc.GetMatchesStr(Html, @"href\s*=\s*(?:[\'\""\s](?<1>[^\""\']*)[\'\""])");

            StringBuilder sb = new StringBuilder();
            foreach (object match in al)
            {
                // Drop the quotes, then the leading "href =" part. The pattern is anchored and
                // whitespace-tolerant so that "href = x" is handled and a literal "href=" inside
                // the URL itself is left alone.
                string link = match.ToString().Replace("\"", "").Replace("'", "");
                link = Regex.Replace(link, @"^href\s*=\s*", "", RegexOptions.IgnoreCase).Trim();

                // Site-relative links are prefixed with the domain typed into textBox2.
                if (link.StartsWith("/"))
                {
                    link = textBox2.Text.Trim() + link;
                }

                // Add a scheme only when none is present; https links must not get "http://" prepended.
                if (!link.StartsWith("http://", StringComparison.OrdinalIgnoreCase)
                    && !link.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
                {
                    link = "http://" + link;
                }

                sb.Append(link + "\r\n");
            }

            // One link per line.
            textBox5.Text = sb.ToString();

            MessageBox.Show("共提取" + al.Count.ToString() + "个链接", "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        }
        catch (Exception err)
        {
            MessageBox.Show("提取出错!原因:" + err.Message, "信息提示", MessageBoxButtons.OK, MessageBoxIcon.Information);
        }
    }

     

     //把采集的产品页面html代码进行字符串处理,提取需要的代码,最后保存到本地一个access数据库中,同时提取产品图片地址并自动现在图片到本地images文件夹下        private void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)        {            //填充产品表            Database.ExecuteNonQuery("delete from Tb_Product");            DataTable dt2 = new DataTable();            OleDbConnection conn = new OleDbConnection(Database.ConnectionStrings);            OleDbDataAdapter da = new OleDbDataAdapter("select * from Tb_Product", conn);            OleDbCommandBuilder cb = new OleDbCommandBuilder(da);            da.Fill(dt2);            dt2.Rows.Clear();

                BackgroundWorker worker = (BackgroundWorker)sender;//这个是做一个进度条

                string[] Urls = textBox5.Text.Trim().ToLower().Replace("/r/n", ",").Split(',');            DataTable dt = new DataTable();            StringBuilder ErrorStr = new StringBuilder();            string html = "", ImageDir = AppDomain.CurrentDomain.BaseDirectory + "Images//";

                //循环每次采集网址            for (int i = 0; i < Urls.Length; i++)            {                try                {                    if (!worker.CancellationPending)                    {                        if (Urls[i] == "")                            return;                        html = inc.GetHtml(Urls[i]);//获取该url的html代码                     DataRow NewRow = dt2.NewRow();

                            //产品名                        string ProductName = html.Substring(html.IndexOf("<title>") + 7);                        NewRow["ProductName"] = ProductName.Remove(ProductName.IndexOf("</title>")).Trim();

                            //产品编号                        NewRow["ModelId"] = NewRow["ProductName"].ToString().Substring(NewRow["ProductName"].ToString().IndexOf("Model:") + 6).Trim();

                            //产品介绍,这些都是根据不同网站的html做相应的修改                        string Introduce = html.Substring(html.IndexOf("Product Details") + 26);                        Introduce = Introduce.Remove(Introduce.IndexOf("</table>") + 8).Trim()

                            NewRow["Introduce"] = Introduce;

     

     

                                //下载图片                            string ProductImage = html.Substring(html.IndexOf("align=center><img") + 17);                            ProductImage = textBox2.Text.Trim() + ProductImage.Substring(ProductImage.IndexOf("src=/"") + 5);                            ProductImage = ProductImage.Remove(ProductImage.IndexOf("/""));                            try                            {                                inc.DownFile(ProductImage, ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1));                            }                            catch (Exception)                            {                                ErrorStr.Append("下载图片失败,图片地址:" + ImageDir + ProductImage.Substring(ProductImage.LastIndexOf("/") + 1) + "/r/n");                            }

                            dt2.Rows.Add(NewRow);

                            //Thread.Sleep(100);                        worker.ReportProgress((i + 1) * 100 / Urls.Length, i);                        toolStripStatusLabel1.Text = "处理进度:" + (i + 1).ToString() + "/" + Urls.Length.ToString();//进度条                    }

                    }                catch (Exception err)                {                    ErrorStr.Append("采集错误:" + err.Message + ";网址:" + Urls[i] + "/r/n");                }            }            da.Update(dt2);            DataBind(dt2);            ShowError(ErrorStr.ToString());        }

             /// <summary>        /// ASPX页面生成静态Html页面

            /// </summary>        public static string GetHtml(string url)        {            StreamReader sr = null;            string str = null;            //读取远程路径            WebRequest request = WebRequest.Create(url);            HttpWebResponse response = (HttpWebResponse)request.GetResponse();            sr = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding(response.CharacterSet));            str = sr.ReadToEnd();            sr.Close();            return str;        }

         // 提取HTML代码中的网址         public static ArrayList GetMatchesStr(string htmlCode, string strRegex)        {            ArrayList al = new ArrayList();

                Regex r = new Regex(strRegex, RegexOptions.IgnoreCase | RegexOptions.Multiline);            MatchCollection m = r.Matches(htmlCode);

                for (int i = 0; i < m.Count; i++)            {                bool rep = false;                string strNew = m[i].ToString();

                    // 过滤重复的URL                 foreach (string str in al)                {                    if (strNew == str)                    {                        rep = true;                        break;                    }                }

                    if (!rep) al.Add(strNew);            }

                al.Sort();

                return al;        }

            public static void DownFile(string Url, string Path)        {

                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);            HttpWebResponse response = (HttpWebResponse)request.GetResponse();            Stream stream = response.GetResponseStream();            long size = response.ContentLength;            //创建文件流对象            using (FileStream fs = new FileStream(Path, FileMode.OpenOrCreate, FileAccess.Write))            {                byte[] b = new byte[1025];                int n = 0;                while ((n = stream.Read(b, 0, 1024)) > 0)                {                    fs.Write(b, 0, n);                }            }        }


    最新回复(0)