HW 1 Query 1.
This page implements the following WebSQL query:
select d1.url,d1.title from document d1 such that http://www.umr.edu -> d1;


-> d1;



Steps to implement this query:
1. Retrieve the HTML file at the given URL.
Extra 1.1: Read the file contents and calculate their length.
Extra 1.2: Store the URL and length in the document table for future queries.
2. Parse the file and extract the href value of every anchor tag.
3. Update the anchor table: store the URL in the anchor.base field and each href value in the anchor.href field.
4. Search for (or retrieve) the files referenced by those href values.
5. Get the title of each of these files by searching (or parsing).
6. Output results:

====================================================================

 

".strlen($final).""; // get file length $final_length = strlen($final); $conn = odbc_connect ('websql', '', '') or die('Could Not Connect to ODBC Database!'); //open DB connection $sql="INSERT INTO document (url, length) values ('$url', $final_length)"; $rs=odbc_exec($conn,$sql); $f = @fopen($url,"r"); while( $buf = fgets($f,1024) ) { $buf = fgets($f, 4096); //2. parse file, get all href tags. //preg_match_all("/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]/isU",$buf,$words); preg_match_all("/<\s*a\s+[^>]*href\s*=\s*[\"']?([^\"' >]+)[\"' >]\s*>\s*([\w\s*]+)<\/a>/",$buf,$words); for( $j = 0; $words[1][$j]; $j++ ) { $cur_word_label = strtolower($words[2][$j]); //print "$cur_word_label
"; $cur_word_href = strtolower($words[1][$j]); //print "$cur_word_href
"; //3. Update anchor table: store url at anchor.base field, store href tags at anchor.href field. $sql="INSERT INTO anchor (base, label, href) values('$url', '$cur_word_label', '$cur_word_href')"; $rs=odbc_exec($conn,$sql); } } fclose($f); //4. retrieve files mentioned at href tags. $sql="SELECT * FROM anchor WHERE base LIKE '$url'"; $rs=odbc_exec($conn,$sql); /* print "

Anchor Table with this url $url:

"; echo ""; echo ""; echo ""; echo ""; while (odbc_fetch_row($rs)) { $conbase=odbc_result($rs,"base"); $conlabel=odbc_result($rs,"label"); $conhref=odbc_result($rs,"href"); echo ""; echo ""; echo ""; } echo "
baselabelhref
$conbase$conlabel$conhref
"; */ while (odbc_fetch_row($rs)) { $conhref=odbc_result($rs,"href"); if (preg_match ("/^http/",$conhref, $array)) { } else { $conhref = $url."/".$conhref; } print "$conhref
"; $f = @fopen($conhref,"r"); if ($f==FALSE) { // filter out false url streams $cur_word_title = "no title found"; print "$cur_word_title

"; } else { $content = $conhref; #Use the NuSoap php library require_once('nuSOAP/nusoap.php'); #set parameters $parameters = array( 'key'=>'m0nmJOdQFHLqAyh+2I5xIYzZ2f3kyFPg', 'q'=>$content, #query here 'start'=>'0', 'maxResults'=>'1', 'filter'=>'false', 'restrict'=>'', 'safeSearch'=>'false', 'lr'=>'', 'ie'=>'latin', 'oe'=>'latin' ); #Create a new soap client, feeding it to GoogleSearch.wsdl on Google's site $soapclient = new soapclient('http://api.google.com/GoogleSearch.wsdl','wsdl'); #query Google $results = $soapclient->call('doGoogleSearch',$parameters); #print_r($results); #print results if (is_array($results['resultElements'])) { // print "

The google query for '".$parameters['q']."' found ". $results['estimatedTotalResultsCount']." results, the top 1 results are:

"; foreach ($results['resultElements'] as $result) { // print "URL : "; // print "".$result['URL'].""; // print "
"; // print "Title : "; // print "".$result['title'].""; if ($result['title'] =="") { print "no title found"; } print $result['title']."

"; } } /* while( $buf = fgets($f,1024) ) { $buf = fgets($f, 4096); //print $buf."
"; // get title from pages preg_match_all("/<\s*title\s*>\s*([^<]+)\s*<\s*\/\\s*title\s*>/",$buf,$words); for( $j = 0; $words[1][$j]; $j++ ) { $cur_word_title = strtolower($words[1][$j]); print "$cur_word_title

"; } } */ fclose($f); } // End if.. else //print "$cur_word_title

"; } // End while (odbc_fetch_row($rs)) odbc_close($conn); //close DB connection } // of if (isset($_POST['url'])) ?>