This article looks at a problem where WebClient does not reliably extract all of the data from a page. It may be a useful reference for anyone running into the same issue.
Problem description
I am trying to extract data from a website. First I fetch the categories; from each category I fetch its subcategories, and from each subcategory I read and extract some text.

I am using WebClient and I am running into a strange problem: sometimes the data is read and sometimes it is not. Sometimes I get 30 categories but only get subcategories for 10 of them; other times I get 10 categories and read all of their subcategories.

How can I solve this problem?

Here is the code:
// Open the listing page and scan it line by line, collecting category and
// subcategory lines as they are encountered.
public Extract(string url)
{
    client = new WebClient();
    strm = client.OpenRead(url);
    strrdr = new StreamReader(strm, Encoding.ASCII);
    categorylines = new List<string>();
    subcategorylines = new List<string>[30];
    while (strrdr.Peek() > 0)
    {
        string line = strrdr.ReadLine();
        line = line.Replace("\n", String.Empty);
        line = line.Replace("\t", string.Empty);
        line = line.Replace("\r", string.Empty);
        line = line.Replace("\\", "");
        //System.Threading.Thread.Sleep(100);
        ExtractLines(line);
    }
    strrdr.Close();
}
public void ExtractSubcategories()
{
    string url = null;
    string name = null;
    for (int i = 0; i < Categories.Category.Count; i++)
    {
        foreach (var item in subcategorylines[i])
        {
            find1 = new Regex(@"href="".+"">", RegexOptions.IgnoreCase);
            find2 = new Regex(@">.+<\/a>", RegexOptions.IgnoreCase);
            m1 = find1.Match(item);
            m2 = find2.Match(item);
            if (m1.Success)
            {
                url = item.Substring(m1.Index + 6, m1.Length - 8);
                url = "www.codeproject.com" + url;
            }
            if (m2.Success)
            {
                name = item.Substring(m2.Index + 1, m2.Length - 5);
            }
            ArticleSubCategory sub = new ArticleSubCategory(name, url);
            Categories.Category[i].SubCategories.Add(sub);
        }
    }
}
public void ExtractCategory()
{
    string url = null;
    string name = null;
    tblcategories = new DataTable();
    foreach (var item in categorylines)
    {
        url = GetLine(@"href="".+"">", 6, 8, item);
        name = GetLine(@">.+<\/a>", 1, 5, item);
        Categories.Add(new ArticleCategory(name, url));
    }
}
// Returns the regex match with `start` characters dropped from its front and
// (end - start) characters dropped from its end, or null when the line does not match.
public string GetLine(string regex, int start, int end, string line)
{
    find1 = new Regex(regex, RegexOptions.IgnoreCase);
    m1 = find1.Match(line);
    if (m1.Success)
    {
        return line.Substring(m1.Index + start, m1.Length - end);
    }
    else return null;
}
//public void ExtractArticle() { }

// Classifies a raw line from the page: a category match starts a new bucket in
// subcategorylines (index tmp); a subcategory match is added to the current bucket.
public void ExtractLines(string line)
{
    string a;
    if ((a = GetLine(categoryRegex, 0, 0, line)) != null)
    {
        tmp++;
        categorylines.Add(a);
        subcategorylines[tmp] = new List<string>();
    }
    if ((a = GetLine(subcategoryRegex, 0, 0, line)) != null)
    {
        subcategorylines[tmp].Add(a);
    }
    //find1 = new Regex(categoryRegex, RegexOptions.IgnoreCase);
    //find2 = new Regex(subcategoryRegex, RegexOptions.IgnoreCase);
    //m1 = find1.Match(line);
    //m2 = find2.Match(line);
    //if (m1.Success)
    //{
    //    tmp = tmp + 1;
    //    categorylines.Add(m1.Value);
    //    subcategorylines[tmp] = new List<string>();
    //}
    //else
    //{
    //    if (m2.Success)
    //    {
    //        subcategorylines[tmp].Add(m2.Value);
    //    }
    //}
}
// For every subcategory page, download it and collect the lines matching articleregex.
public void ExtractArticleMeta()
{
    tmp = 0;
    string a;
    foreach (var item in Categories.Category)
    {
        articleLines[tmp] = new List<string>();
        foreach (var subcat in item.SubCategories)
        {
            client = new WebClient();
            strm = client.OpenRead("http://" + subcat.Url);
            strrdr = new StreamReader(strm, Encoding.ASCII);
            while (strrdr.Peek() > 0)
            {
                string line = strrdr.ReadLine();
                line = line.Replace("\n", String.Empty);
                line = line.Replace("\t", string.Empty);
                line = line.Replace("\r", string.Empty);
                line = line.Replace("\\", "");
                if ((a = GetLine(articleregex, 0, 0, line)) != null)
                {
                    articleLines[tmp].Add(a);
                }
            }
        }
        tmp = tmp + 1;
    }
}
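
For reference, the start/end arguments passed to GetLine (6 and 8 for the URL pattern, 1 and 5 for the name pattern) only exist to strip the literal href=" / "> and > / </a> text surrounding each match. A regex with named capture groups can do the same extraction without the offset arithmetic. The following is a minimal sketch, with Anchor and TryParseAnchor as illustrative names, assuming each listing line contains an anchor of the form <a href="/relative/url">Some name</a>:

// Illustrative helper, not taken from the code above.
// Assumes: using System.Text.RegularExpressions;
static readonly Regex Anchor =
    new Regex(@"href=""(?<url>[^""]+)""[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);

static bool TryParseAnchor(string line, out string url, out string name)
{
    Match m = Anchor.Match(line);
    url = m.Success ? m.Groups["url"].Value : null;   // relative URL; callers can prefix the host
    name = m.Success ? m.Groups["name"].Value : null;
    return m.Success;
}

ExtractCategory and ExtractSubcategories could then use a single match per line instead of the separate find1/find2 passes.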
Recommended answer
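
When a page sometimes parses completely and sometimes does not, one thing worth looking at is how the response is being read. The code above parses line by line directly from the network stream (OpenRead plus Peek()/ReadLine()), so a truncated response can simply end the loop early without any error being raised. Downloading the whole body in one call and splitting it into lines afterwards at least separates "did the download succeed" from "did the parse find everything". A minimal sketch, assuming the goal is just to obtain every line of a page (DownloadLines is an illustrative name):

// Illustrative helper: DownloadString blocks until the response body has been read,
// so the parsing step is no longer interleaved with the network read.
// Assumes: using System; using System.Net;
static string[] DownloadLines(string url)
{
    using (var client = new WebClient())
    {
        string html = client.DownloadString(url);
        return html.Split(new[] { '\r', '\n' }, StringSplitOptions.RemoveEmptyEntries);
    }
}

The resulting lines could be fed to ExtractLines (or to GetLine with articleregex) exactly as before; if requests still fail intermittently, wrapping the DownloadString call in a small retry loop is a common next step.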
That concludes this article on data not being fully extracted with WebClient. We hope it is helpful, and thank you for your continued support!