Monday, May 7, 2012

PHP xls, xlsx, ppt, pptx headers

.xls

application/vnd.ms-excel

.xlsx

application/vnd.openxmlformats-officedocument.spreadsheetml.sheet

.ppt

application/vnd.ms-powerpoint

.pptx

application/vnd.openxmlformats-officedocument.presentationml.presentation

And one of those you have listed is wrong:

.docx

application/vnd.openxmlformats-officedocument.wordprocessingml.document

Logging In With CURL and PHP

This is an example of how you can use CURL to "log in" and retrieve some protected info. I've shown the somewhat extreme case where you need to maintain cookies, spoof the HTTP referer and use SSL. The example page used here is ClickBank – one of the largest digital product retailers. I've tried to comment the source as much as possible to make it easier to understand. See the end of this post for more information on CURL. You can also leave a comment if you have further questions.
 
/*********************************
** Set up your variables
**********************************/

$cookiefile = tempnam(sys_get_temp_dir(), "cookies");
/* Create a temporary file to store cookies.
   sys_get_temp_dir() is more portable than a
   hard-coded "/tmp" path. */

/* The page that displays the login form. */
$login_url = "https://www.example.com/login";

/* The "action" value of the login form. This is not always
   equal to $login_url. */
$login_post_url = "https://www.example.com/do-login";

/* The protected page you want to retrieve once logged in. */
$data_url = "https://www.example.com/account";

$username = "username";
$password = "passw0rd";

/* Disguise ourselves as a browser application. Some servers
   may refuse requests that carry no familiar User-Agent,
   or use this value to detect non-human visitors. */
$agent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

/*********************************
** Load the "login" page and get some cookies
**********************************/

$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $login_url);   /* the page to retrieve */
curl_setopt($ch, CURLOPT_USERAGENT, $agent);

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
/* Don't output the results - return them as a string instead */

curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
/* Follow redirects. Not strictly necessary for this first request. */

curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile); /* read cookies from this file */
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);  /* ...and save them to the same file */

/* SSL options - remove if not needed. VERIFYHOST=2 checks that a
   common name exists in the certificate and matches the hostname. */
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);

/* WARNING: disabling peer verification suppresses the
   "SSL certificate problem, verify that the CA cert is OK." error,
   but it also makes the connection vulnerable to man-in-the-middle
   attacks. Prefer pointing CURLOPT_CAINFO at a CA bundle instead. */
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

$result = curl_exec($ch);  /* perform the query, retrieve the page */
curl_close($ch);

/*************************************
** Actually log in with the proper referer and cookies
**************************************/

/* The fields of the login form. These will probably be
   different for every particular page. */
$postfields = array(
        'nick' => $username,
        'pass' => $password,
        //'rememberMe' => 'false',
        'j_username' => $username,
        'j_password' => $password,
    );

/* If the server checks the referer we need to spoof it */
$referer = $login_url;

$ch = curl_init();

curl_setopt($ch, CURLOPT_URL, $login_post_url);
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_POST, 1);

/* http_build_query() properly escapes the fields and builds an
   application/x-www-form-urlencoded request body. */
curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postfields));

curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1); /* probably necessary here */
curl_setopt($ch, CURLOPT_REFERER, $referer); /* spoof the HTTP referer */

/* Note: the same cookie file as before, so the session persists */
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);

curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); /* see warning above */

$result = curl_exec($ch);
/* $result now contains the page you see after logging in */
curl_close($ch);

/*****************************************
** If you need to get another page...
** Reuse the same cookie file and maybe spoof the referer if needed.
******************************************/

$referer = $login_post_url;

$ch = curl_init();

curl_setopt($ch, CURLOPT_URL, $data_url);
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_REFERER, $referer);
curl_setopt($ch, CURLOPT_COOKIEFILE, $cookiefile);
curl_setopt($ch, CURLOPT_COOKIEJAR, $cookiefile);
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);

$result = curl_exec($ch);
curl_close($ch);

echo $result;

/******************************************
** All done - delete the cookie file once it's no longer needed
*******************************************/
unlink($cookiefile);
 

How To *Really* Upload Files With PHP

I've noticed that whenever I search for a "php upload script" or "how to upload files", the first page of Google results is full of pages that tell you how to handle file uploads — letting users upload something to your server. But what if you actually want to create a PHP script that will upload a file to another website?

So today I will show you a simple PHP upload script that simulates a web browser uploading a file through a file submission form. I'll be using the excellent CURL library for this purpose.

So, without further ado, here's the upload script :
 
 
/* This should be a fully qualified filename of the file you
   want to upload. Relative filenames may not work. */
$filename = "/folder/file.txt";

/* The page that processes the upload. You can usually
   find this in the "ACTION" field of the HTML form. */
$submit_url = "https://www.example.com/upload";

/* The form fields and their values. Since PHP 5.5 file uploads
   must use CURLFile objects; the old "@/path/to/file" string
   syntax was deprecated and then removed entirely in PHP 7. */
$post_data = array(
    "field1" => "value1",
    "field2" => "value2",
    "file_field" => new CURLFile($filename)
);
/* If the form you're submitting has multiple file fields you can
   upload several files by using the correct field names. */

$ch = curl_init($submit_url);
/* Follow redirects (optional) */
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
/* Use the "POST" method (possibly redundant) */
curl_setopt($ch, CURLOPT_POST, 1);
/* Passing an array (not a string) makes cURL send the request as
   multipart/form-data, which is required for file uploads. */
curl_setopt($ch, CURLOPT_POSTFIELDS, $post_data);

/* Upload the file now */
$results = curl_exec($ch);
curl_close($ch);
/* Now the $results variable contains any response the
   $submit_url might have returned, or false if the upload failed */
 

Naturally you will need to modify the $submit_url and other variables to fit your situation.

Also note that in most practical cases you must "log in" to the target website before you can upload anything, so you might want to read my how to log in with PHP and CURL post.

How To Check If Page Exists With CURL

Here's a relatively simple PHP function that will check if a URL really leads to a valid page (as opposed to generating "404 Not Found" or some other kind of error). It uses the CURL library – if your server doesn't have it installed, see "Alternatives" at the end of this post. This script may be useful for finding broken links and similar tasks.
 
function page_exists($url){
  /* Returns true if $url leads to a real page, i.e. the server
     answers with an HTTP status in the 2xx-3xx range (redirects
     count as "exists"). Expects a fully qualified, well-formed URL;
     returns false for malformed URLs, errors and timeouts. */
  $parts = parse_url($url);
  if(!$parts) return false; /* the URL was seriously wrong */

  $ch = curl_init();
  curl_setopt($ch, CURLOPT_URL, $url);

  /* set the user agent - might help, doesn't hurt */
  curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

  /* try to follow redirects */
  curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

  /* timeout after the specified number of seconds. assuming that this
     script runs on a server, 20 seconds should be plenty of time to
     verify a valid URL */
  curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 15);
  curl_setopt($ch, CURLOPT_TIMEOUT, 20);

  /* don't download the page, just the headers (much faster here) */
  curl_setopt($ch, CURLOPT_NOBODY, true);
  curl_setopt($ch, CURLOPT_HEADER, true);

  /* handle HTTPS links. isset() guards against scheme-less URLs. */
  if(isset($parts['scheme']) && $parts['scheme'] == 'https'){
    /* VERIFYHOST accepts only 0 or 2 in modern cURL; the original
       value 1 is invalid and rejected by current libcurl versions. */
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    /* NOTE: skipping peer verification is MITM-unsafe; acceptable
       here because we only test existence, not content integrity. */
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
  }

  $response = curl_exec($ch);

  /* Ask cURL for the status code instead of regex-parsing the
     status line: the old pattern /HTTP\/1\.\d+\s+(\d+)/ fails on
     HTTP/2 responses, whose status line is "HTTP/2 200". */
  $code = ($response === false) ? 0 : intval(curl_getinfo($ch, CURLINFO_RESPONSE_CODE));
  curl_close($ch);

  /* see if the code indicates success */
  return (($code >= 200) && ($code < 400));
}
 
 

Notes on implementation
I've used a somewhat liberal interpretation of "exists" here – this function will return TRUE even when URL redirects to a different page. I think that this is generally a good idea.

Another thing to note is that this function expects a fully qualified and well-formed URL. Checking whether a random string represents a syntactically valid URL is not its purpose, and using it that way would be inefficient and error-prone.

If you're familiar with CURL you might know about the CURLOPT_FAILONERROR option which is supposed to make curl_exec() treat a non-existent page as an error. It might seem that with this option set page_exists() might be simplified by only checking if $response equals FALSE (indicating an error). Well, that doesn't really work, at least not as expected. In my tests CURLOPT_FAILONERROR made curl_exec() fail when the returned HTTP status code was 302 – a form of temporary redirect. Needless to say the URL in question worked fine in my browser so I decided to blame CURL and revise the function to explicitly check the status code, treating all codes in the 2XX – 3XX range as success.

Alternatives
If you can't or don't want to use CURL there are other ways to see if a page exists.

  • fopen() – try opening the URL as a file and hope the fopen() URL wrapper is enabled. You can find lots of similar examples on Google.
/* Note: the original snippet was missing the closing brace of the
   "if" branch, which made it a parse error; it also never closed
   the handle it opened. */
$handle = @fopen($url, 'r');
if($handle !== false){
   echo 'Page Exists';
   fclose($handle); /* release the stream we opened */
} else {
   echo 'Page Not Found';
}
  • fsockopen() – use sockets to connect to the target host, build the HTTP request by hand and analyze the server's response. See some page-checking examples in the comments for the fsockopen() function on php.net. IMHO this method is a bit of overkill – it's complex and may lead to strange bugs if you don't know exactly what you're doing.

Checking If Page Contains a Link In PHP

Sometimes it is necessary to verify that a given page really contains a specific link. This is usually done when checking for a reciprocal link in link exchange scripts and so on.

Several things need to be considered in this situation :

  • Only actual links count. A plain-text URL should not be accepted.
  • Links inside HTML comments (<!-- … -->) are no good.
  • Nofollow'ed links are out as well.
Here's a PHP function that satisfies these requirements
 
function contains_link($page_url, $link_url) {
    /* Returns true if the page at $page_url contains an actual
       <a href="..."> link to $link_url. Plain-text URLs, links
       inside HTML comments and rel=nofollow links do not count.
       Returns false on download failure or when no match is found. */

    /* Get the page at $page_url */
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $page_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

    curl_setopt($ch, CURLOPT_USERAGENT,
      'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');

    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 30);
    curl_setopt($ch, CURLOPT_TIMEOUT, 60);
    curl_setopt($ch, CURLOPT_FAILONERROR, true); /* treat 404 etc. as failure */

    $html = curl_exec($ch);
    curl_close($ch);

    if(!$html) return false;

    /* Remove HTML comments and their contents. The original pattern
       /<!--.*-->/i was greedy and lacked the "s" modifier: it could
       wipe out everything between the FIRST <!-- and the LAST -->
       on a line, and it missed comments spanning multiple lines.
       The lazy quantifier + /s fixes both problems. */
    $html = preg_replace('/<!--.*?-->/s', '', $html);

    /* Extract all links */
    $regexp = '/(<a[\s]+[^>]*href\s*=\s*[\"\']?)([^\'\" >]+)([\'\"]+[^<>]*>)/i';
    if (!preg_match_all($regexp, $html, $matches, PREG_SET_ORDER)) {
        return false; /* no links on the page at all */
    }

    /* Check each link */
    foreach($matches as $match){
        /* Skip links that carry rel=nofollow */
        if(preg_match('/rel\s*=\s*[\'\"]?nofollow[\'\"]?/i', $match[0])) continue;
        /* If URL = backlink URL, we've found the backlink */
        if ($match[2] == $link_url) return true;
    }

    return false;
}
 
/* Usage example. The original excerpt lost the "if" line, leaving a
   dangling "}" (a parse error); reconstructed with placeholder URLs. */
if (contains_link('http://www.example.com/links.html', 'http://www.mysite.com/')) {
    echo 'Reciprocal link found.';
} else {
    echo 'Reciprocal link not found.';
}

How To Extract All URLs From A Page Using PHP

Recently I needed a crawler script that would create a list of all pages on a single domain. As a part of that I wrote some functions that could download a page, extract all URLs from the HTML and turn them into absolute URLs (so that they themselves can be crawled later). Here's the PHP code.

Extracting All Links From A Page
Here's a function that will download the specified URL and extract all links from the HTML. It also translates relative URLs to absolute URLs, tries to remove repeated links and is overall a fine piece of code :) Depending on your goal you may want to comment out some lines (e.g. the part that strips '#something' (in-page links) from URLs).
 
function crawl_page($page_url, $domain) {
/* $page_url - page to extract links from; $domain - crawl only
   this domain (and its subdomains).
   Returns an array of unique absolute URLs, or false on failure. */

/* Retrieve the page with cURL */
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $page_url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);

/* Spoof the User-Agent header value; just to be safe */
    curl_setopt($ch, CURLOPT_USERAGENT,
      'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)');

/* Timeouts for the connection and the download, so the script
   doesn't get stuck on huge files or unresponsive servers.
   These are optional. */
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 15);

/* This ensures 404 Not Found (and similar) will be
   treated as errors */
    curl_setopt($ch, CURLOPT_FAILONERROR, true);

/* Hint that we only want HTML (servers may ignore it).
   Note: the array is now initialized explicitly - the original
   appended to an undeclared $header variable. */
    $headers = array("Accept: text/html, text/*");
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);

/* Download the page */
    $html = curl_exec($ch);
    curl_close($ch);

    if(!$html) return false;

/* Extract the BASE tag (if present) for
   relative-to-absolute URL conversions later */
    if(preg_match('/<base[\s]+href=\s*[\"\']?([^\'\" >]+)[\'\" >]/i', $html, $matches)){
        $base_url = $matches[1];
    } else {
        $base_url = $page_url;
    }

    $links = array();

    $html = str_replace("\n", ' ', $html);
    preg_match_all('/<a[\s]+[^>]*href\s*=\s*([\"\']+)([^>]+?)(\1|>)/i', $html, $m);

    foreach($m[2] as $url) {
        $url = trim($url);

        /* get rid of PHPSESSID, #in-page-anchors, &amp; and javascript: */
        $url = preg_replace(
            array('/([\?&]PHPSESSID=\w+)$/i', '/(#[^\/]*)$/i', '/&amp;/', '/^(javascript:.*)/i'),
            array('', '', '&', ''),
            $url);

        /* Turn relative URLs into absolute URLs.
           relative2absolute() is defined elsewhere in this file. */
        $url = relative2absolute($base_url, $url);

        /* Keep only URLs on the requested (sub-)domain.
           preg_quote() escapes ALL regex metacharacters in $domain;
           the original str_replace('.', '\.', ...) escaped only dots. */
        if(preg_match('/^http[s]?:\/\/[^\/]*' . preg_quote($domain, '/') . '/i', $url)) {
            /* save the URL, skipping duplicates */
            if(!in_array($url, $links, true)) $links[] = $url;
        }
    }

    return $links;
}
 
How To Translate a Relative URL to an Absolute URL
This script is based on a function I found on the web with some small but significant changes.
 
function relative2absolute($absolute, $relative) {
    /* Resolve $relative against the base URL $absolute and return the
       resulting absolute URL as a string.
       - If $relative already has a scheme it is returned unchanged.
       - Returns false if either URL is seriously malformed. */
    $p = @parse_url($relative);
    if(!$p) {
        /* $relative is a seriously malformed URL */
        return false;
    }
    if(isset($p['scheme'])) return $relative;

    $parts = parse_url($absolute);
    if(!$parts) return false; /* base URL is malformed too */

    if(substr($relative, 0, 1) == '/') {
        /* Root-relative path: discard the base path entirely */
        $cparts = explode('/', $relative);
        array_shift($cparts);
    } else {
        /* Path-relative: start from the base path minus its last
           segment (the "file name" part) */
        if(isset($parts['path'])) {
            $aparts = explode('/', $parts['path']);
            array_pop($aparts);
            /* Drop empty segments. The strlen callback keeps a literal
               "0" segment, which bare array_filter() would discard. */
            $aparts = array_filter($aparts, 'strlen');
        } else {
            $aparts = array();
        }
        $cparts = array_merge($aparts, explode('/', $relative));
    }

    /* Collapse "." and ".." segments using a stack. The original
       unset($cparts[$i-1]) approach failed on consecutive ".."
       segments because unset() does not renumber array indices. */
    $stack = array();
    foreach($cparts as $part) {
        if($part === '.') {
            continue;            /* "." - stay in the same directory */
        } elseif($part === '..') {
            array_pop($stack);   /* ".." - go up one directory */
        } else {
            $stack[] = $part;
        }
    }
    $path = implode('/', $stack);

    /* Reassemble scheme://user:pass@host/path.
       isset() avoids the undefined-index notice the original emitted
       when the base URL carried no scheme. */
    $url = '';
    if(isset($parts['scheme'])) {
        $url = $parts['scheme'] . '://';
    }
    if(isset($parts['user'])) {
        $url .= $parts['user'];
        if(isset($parts['pass'])) {
            $url .= ':' . $parts['pass'];
        }
        $url .= '@';
    }
    if(isset($parts['host'])) {
        $url .= $parts['host'] . '/';
    }
    $url .= $path;

    return $url;
}

How To Force File Download With PHP

Say you want a PHP script that will make the browser download a file instead of opening it. This is useful for common filetypes that would normally be displayed in a browser, like .html, images, PDFs and .doc files.

You can find a lot of scripts that do this on the web but one thing most of them don't handle is download resuming and multi-thread downloading. If you don't need that feature, feel free to use a simpler script. Personally I've found that a function that handles download resuming works more reliably across various browsers (what actually happened : a simpler script didn't work with Opera + my weird Internet connection, so I found another script and decided to make broad generalizations about download resuming :p).

Note – most of the code below isn't mine. I found it somewhere on the web and adjusted it for my needs; unfortunately I've lost the URL of the original page. If you know where it came from, let me know and I'll add a link to this post.
 
function output_file($file, $name, $mime_type='')
{
 /*
 Stream the file at path $file to the browser as a download named
 $name, with MIME type $mime_type (guessed from the file extension
 when omitted). Supports HTTP Range requests, which enables download
 resuming and multi-threaded downloaders. Terminates the script
 (die) when finished.

 If you want to do something on download abort/finish,
 register_shutdown_function('function_name');
 */
 if(!is_readable($file)) die('File not found or inaccessible!');

 $size = filesize($file);
 $name = rawurldecode($name);

 /* Figure out the MIME type (if not specified) */
 $known_mime_types = array(
    "pdf"  => "application/pdf",
    "txt"  => "text/plain",
    "html" => "text/html",
    "htm"  => "text/html",
    "exe"  => "application/octet-stream",
    "zip"  => "application/zip",
    "doc"  => "application/msword",
    "xls"  => "application/vnd.ms-excel",
    "ppt"  => "application/vnd.ms-powerpoint",
    "gif"  => "image/gif",
    "png"  => "image/png",
    /* "image/jpg" is not a registered MIME type;
       the correct value for JPEG images is "image/jpeg" */
    "jpeg" => "image/jpeg",
    "jpg"  => "image/jpeg",
    "php"  => "text/plain"
 );

 if($mime_type == ''){
     $file_extension = strtolower(substr(strrchr($file, "."), 1));
     if(array_key_exists($file_extension, $known_mime_types)){
        $mime_type = $known_mime_types[$file_extension];
     } else {
        /* unknown extension: force a download dialog */
        $mime_type = "application/force-download";
     }
 }

 @ob_end_clean(); /* turn off output buffering to decrease CPU usage */

 /* required for IE, otherwise Content-Disposition may be ignored */
 if(ini_get('zlib.output_compression'))
  ini_set('zlib.output_compression', 'Off');

 header('Content-Type: ' . $mime_type);
 header('Content-Disposition: attachment; filename="'.$name.'"');
 header("Content-Transfer-Encoding: binary");
 header('Accept-Ranges: bytes');

 /* The three lines below basically make the download non-cacheable */
 header("Cache-control: private");
 header('Pragma: private');
 header("Expires: Mon, 26 Jul 1997 05:00:00 GMT");

 /* multipart-download and download resuming support.
    The Range header looks like "bytes=500-999" or "bytes=500-". */
 $range = 0;
 $new_length = $size;
 if(isset($_SERVER['HTTP_RANGE']))
 {
    list(, $range) = explode("=", $_SERVER['HTTP_RANGE'], 2);
    list($range) = explode(",", $range, 2); /* honour only the first range */
    /* Guard the explode so a malformed header without "-" does not
       trigger an undefined-offset notice, as the original did. */
    $range_end = null;
    if(strpos($range, '-') !== false){
        list($range, $range_end) = explode("-", $range, 2);
    }
    $range = intval($range);
    if(!$range_end) {
        $range_end = $size - 1; /* open-ended range: serve to end of file */
    } else {
        $range_end = intval($range_end);
    }

    $new_length = $range_end - $range + 1;
    header("HTTP/1.1 206 Partial Content");
    header("Content-Length: $new_length");
    header("Content-Range: bytes $range-$range_end/$size");
 } else {
    header("Content-Length: " . $size);
 }

 /* output the file itself */
 $chunksize = 1 * (1024 * 1024); /* 1 MiB; you may want to change this */
 $bytes_sent = 0;
 /* "rb": binary mode matters on Windows - plain "r" can corrupt binary
    files there. A separate handle variable avoids shadowing $file. */
 if ($fh = fopen($file, 'rb'))
 {
    if(isset($_SERVER['HTTP_RANGE']))
        fseek($fh, $range);

    while(!feof($fh) &&
        (!connection_aborted()) &&
        ($bytes_sent < $new_length)
          )
    {
        $buffer = fread($fh, $chunksize);
        print($buffer);
        flush(); /* push the chunk to the client immediately */
        $bytes_sent += strlen($buffer);
    }
    fclose($fh);
 } else die('Error - can not open file.');

 die();
}
 
/*********************************************
            Example of use
**********************************************/

/* Prevent the script from timing out while the file is being
   streamed to the client; 0 means "no execution time limit". */
set_time_limit(0);

$path_to_file = 'that_one_file.txt';
output_file($path_to_file, 'some file.txt', 'text/plain');