URLパーサをつくってみた

URLを「スキーム、ホスト、ポート、パス、クエリー」に分解するツールをつくってみました。

ソース

/**
 * url_parser.c
 **/

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Longest URL accepted, in characters (set to the same value as Internet Explorer's limit). */
#define URL_MAX_LEN 2083 /* same value as IE's URL length limit */
/* Longest scheme accepted, in characters, not counting the terminating NUL. */
#define SCHEME_MAX_LEN 32
/* Text that separates the scheme from the rest of the URL. */
#define SCHEME_DELIMITER "://"
/* The domain part is a substring of the URL, so it shares the URL's limit. */
#define DOMAIN_MAX_LEN URL_MAX_LEN

/**
 * Split the URL given as the only command-line argument into scheme, host,
 * port, path and query, and print one labelled component per line on stdout.
 *
 * Accepted shape:
 *     [scheme "://"] host [":" port] ["/" path] ["?" query]
 *
 * - The scheme is recognised only when "://" appears before the first '/'
 *   or '?', so a "://" inside the path or query is left alone.
 * - The path defaults to "/" when the URL has none.
 * - The port is everything after the first ':' of the host[:port] part; it
 *   is not validated. IPv6 literals, user-info and "#fragment" are not
 *   treated specially.
 *
 * Exits with EXIT_FAILURE (message on stderr) on a wrong argument count, a
 * URL longer than URL_MAX_LEN, a scheme longer than SCHEME_MAX_LEN, or an
 * empty host; otherwise returns EXIT_SUCCESS.
 */
int main(int argc, char *argv[])
{
    char work[URL_MAX_LEN + 1];      /* private copy that is cut into pieces */
    char scheme[SCHEME_MAX_LEN + 1];
    char path[URL_MAX_LEN + 1];
    const char *url;
    const char *port = "";
    const char *query = "";
    char *host;
    char *delimiter;
    char *tail;
    char *colon;
    size_t url_length;
    size_t scheme_length;
    size_t path_length;

    if (argc != 2) {
        fprintf(stderr, "Usage: %s url\n", argv[0]);
        exit(EXIT_FAILURE);
    }

    url = argv[1];
    url_length = strlen(url);
    if (url_length > URL_MAX_LEN) {
        fprintf(stderr, "url too long\n");
        exit(EXIT_FAILURE);
    }

    /* Work on a copy so that argv[1] stays intact for the final report. */
    memcpy(work, url, url_length + 1);

    /* Defaults for the optional components. */
    scheme[0] = '\0';
    path[0] = '/';
    path[1] = '\0';
    host = work;

    /*
     * Scheme: the delimiter only counts when it is not at the very start and
     * no '/' or '?' precedes it; otherwise it belongs to the path or query.
     */
    delimiter = strstr(work, SCHEME_DELIMITER);
    if (delimiter != NULL && delimiter != work &&
        (size_t)(delimiter - work) < strcspn(work, "/?")) {
        scheme_length = (size_t)(delimiter - work);
        if (scheme_length > SCHEME_MAX_LEN) {
            fprintf(stderr, "scheme too long\n");
            exit(EXIT_FAILURE);
        }
        memcpy(scheme, work, scheme_length);
        scheme[scheme_length] = '\0';
        host = delimiter + strlen(SCHEME_DELIMITER);
    }

    /* The host[:port] part ends at the first '/' or '?', or at the end. */
    tail = host + strcspn(host, "/?");
    if (*tail == '/') {
        /* Path runs up to the first '?'; it always fits because it is a
         * substring of a URL that already passed the length check. */
        path_length = strcspn(tail, "?");
        memcpy(path, tail, path_length);
        path[path_length] = '\0';
        if (tail[path_length] == '?') {
            query = tail + path_length + 1;
        }
    } else if (*tail == '?') {
        /* Query directly after the host: the path keeps its "/" default. */
        query = tail + 1;
    }
    *tail = '\0'; /* terminate host[:port]; path and query lie beyond it */

    /* Separate the optional port from the host. */
    colon = strchr(host, ':');
    if (colon != NULL) {
        *colon = '\0';
        port = colon + 1;
    }

    if (*host == '\0') {
        fprintf(stderr, "domain is not listed \n");
        exit(EXIT_FAILURE);
    }

    printf("url    : %s\n", url);
    printf("scheme : %s\n", scheme);
    printf("host   : %s\n", host);
    printf("port   : %s\n", port);
    printf("path   : %s\n", path);
    printf("query  : %s\n", query);

    return EXIT_SUCCESS;
}

利用方法

$ ./url_parser "http://google.com"
url    : http://google.com
scheme : http
host   : google.com
port   : 
path   : /
query  :
$ ./url_parser "http://linuxjm.sourceforge.jp:80/cgi-bin/man.cgi?Pagename=test"
url    : http://linuxjm.sourceforge.jp:80/cgi-bin/man.cgi?Pagename=test
scheme : http
host   : linuxjm.sourceforge.jp
port   : 80
path   : /cgi-bin/man.cgi
query  : Pagename=test