C# Programming
08-26-2009, 06:28 AM
Hi folks I know there are lots of different free ones out there.
But I wanted to have a go at somthing quick as a learning exercise.
I got as far as the code below, but am having trouble putting found URL's (i.e. 'eachLineValue') from a web page back into the WebRequest.Create(URL)
Do i need some sort of function around this that I can call recursively with different URL's ?
thanks Mark
using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace ShoppingDataExtractor
{
public partial class spider : System.Web.UI.Page
{
protected void Page_Load(object sender, EventArgs e)
{
// used to build entire input
StringBuilder sb = new StringBuilder();
// used on each read operation
byte[] buf = new byte[8192];
string initialURL = "http://www.bestpricedirectory.com.au/";
try
{
WebRequest myRequest = WebRequest.Create(initialURL);
// Return the response.
WebResponse myResponse = myRequest.GetResponse();
Response.Write(myResponse);
// Code to use the WebResponse goes here.
Stream resStream = myResponse.GetResponseStream();
string tempString = null;
int count = 0;
do
{
// fill the buffer with data
count = resStream.Read(buf, 0, buf.Length);
// make sure we read some data
if (count != 0)
{
// translate from bytes to ASCII text
tempString = Encoding.ASCII.GetString(buf, 0, count);
// continue building the string
sb.Append(tempString);
}
}
while (count > 0); // any more data to read?
Response.Write(sb.ToString());
// regex to find all html links
// next - to use recursion to follow all links and write out
Regex r = new Regex(@"href=\""(.*?)\""",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
MatchCollection matches = r.Matches(sb.ToString());
foreach (Match match in matches)
{
Array values = match.Value.ToCharArray();
foreach (System.Char line in values)
{
//Response.Write("{0} ", line);
File.AppendAllText(@"C:/Documents and Settings/Administrator/My Documents/Visual Studio 2008/Projects/ShoppingDataExtractor/spider.txt", ({0}) ,line);
sb.ToString();
//File.AppendAllText(@"C:/Documents and Settings/Administrator/My Documents/Visual Studio 2008/Projects/ShoppingDataExtractor/spider.txt", match.Value + Environment.NewLine);
}
// Close the response to free resources.
}
myResponse.Close();
resStream.Close();
}
catch (Exception ex)
{
Response.Write(ex.ToString());
}
}
}
}
But I wanted to have a go at somthing quick as a learning exercise.
I got as far as the code below, but am having trouble putting found URL's (i.e. 'eachLineValue') from a web page back into the WebRequest.Create(URL)
Do i need some sort of function around this that I can call recursively with different URL's ?
thanks Mark
using System;
using System.Collections.Generic;
using System.Web;
using System.Web.UI;
using System.Web.UI.WebControls;
using System.Net;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace ShoppingDataExtractor
{
public partial class spider : System.Web.UI.Page
{
protected void Page_Load(object sender, EventArgs e)
{
// used to build entire input
StringBuilder sb = new StringBuilder();
// used on each read operation
byte[] buf = new byte[8192];
string initialURL = "http://www.bestpricedirectory.com.au/";
try
{
WebRequest myRequest = WebRequest.Create(initialURL);
// Return the response.
WebResponse myResponse = myRequest.GetResponse();
Response.Write(myResponse);
// Code to use the WebResponse goes here.
Stream resStream = myResponse.GetResponseStream();
string tempString = null;
int count = 0;
do
{
// fill the buffer with data
count = resStream.Read(buf, 0, buf.Length);
// make sure we read some data
if (count != 0)
{
// translate from bytes to ASCII text
tempString = Encoding.ASCII.GetString(buf, 0, count);
// continue building the string
sb.Append(tempString);
}
}
while (count > 0); // any more data to read?
Response.Write(sb.ToString());
// regex to find all html links
// next - to use recursion to follow all links and write out
Regex r = new Regex(@"href=\""(.*?)\""",
RegexOptions.IgnoreCase | RegexOptions.Compiled);
MatchCollection matches = r.Matches(sb.ToString());
foreach (Match match in matches)
{
Array values = match.Value.ToCharArray();
foreach (System.Char line in values)
{
//Response.Write("{0} ", line);
File.AppendAllText(@"C:/Documents and Settings/Administrator/My Documents/Visual Studio 2008/Projects/ShoppingDataExtractor/spider.txt", ({0}) ,line);
sb.ToString();
//File.AppendAllText(@"C:/Documents and Settings/Administrator/My Documents/Visual Studio 2008/Projects/ShoppingDataExtractor/spider.txt", match.Value + Environment.NewLine);
}
// Close the response to free resources.
}
myResponse.Close();
resStream.Close();
}
catch (Exception ex)
{
Response.Write(ex.ToString());
}
}
}
}