Next Spaceship

Driving into future…

Obtain the Title of a Website

| Comments

The idea is to retrieve the HTML of the website and parse it to find the content between <title> and </title>.

To get the HTML file, we need a third-party library, because neither the C nor the C++ standard library provides functions for fetching a URL. I chose the libcurl library, which you can download here: http://curl.haxx.se/libcurl/.

To parse the HTML file, I use the POSIX regular expression library (<regex.h>), which is available on GNU systems.

Here is my code:

Get The Title Of A Website
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#include <stdlib.h>
#include <stdio.h>
#include <curl/curl.h>
#include <regex.h>
#include <string.h>

//Created by Leon
//http://leons.im
//Mar, 21, 2011

/*
 * File streams shared by the program; all start out unopened (NULL).
 *   fin  - stream for "in.txt"
 *   ftmp - stream for "tmp.txt", the scratch file holding the downloaded page
 *   fout - stream for "out.txt", which receives the extracted title
 */
FILE *fin = NULL;
FILE *ftmp = NULL;
FILE *fout = NULL;

/*
 * Data sink with the signature of a libcurl write callback: it ignores the
 * received bytes and reports all of them as consumed.
 *
 * buffer - start of the received data (unused)
 * size   - size of one element
 * nmemb  - number of elements
 * userp  - caller-supplied context pointer (unused)
 *
 * Returns size * nmemb.
 */
size_t write_data (void *buffer, size_t size, size_t nmemb, void *userp) {
    (void) buffer;
    (void) userp;

    const size_t consumed = size * nmemb;
    return consumed;
}

int main (void) {
    CURL *curl;
    CURLcode res;

    fin = fopen ("in.txt", "r");
    ftmp = fopen("tmp.txt", "w");
    fout = fopen("out.txt", "w");
    curl = curl_easy_init();

    if(curl) {
        printf("Got easy handle...n");
        curl_easy_setopt(curl, CURLOPT_URL, "http://leons.im");
        //curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, ftmp);
        //curl_easy_setopt(curl, CURLOPT_PROXY, "proxy.com:8080");
        //curl_easy_setopt(curl, CURLOPT_PROXYUSERPWD, "user:pwd");
        printf("Options is set.n");
        res = curl_easy_perform(curl);
        printf("Performed.n");
        fclose(ftmp);

        ftmp = fopen ("tmp.txt", "rb");
        if (NULL == ftmp) {
            fputs ("File error", stderr);
            exit (1);
        }
        // Obtain file size:
        fseek (ftmp, 0, SEEK_END);
        long lSize = ftell (ftmp);
        rewind (ftmp);

        // Allocate memory to contain the whole file:
        char *buffer;
        buffer = (char *) malloc (sizeof(char) * lSize);
        if (NULL == buffer) {
            fputs ("Memory error", stderr);
            exit (2);
        }

        // Copy the file into the buffer:
        size_t result;
        result = fread (buffer, 1, lSize, ftmp);
        if (result != lSize) {
            fputs ("Reading error", stderr);
            exit (3);
        }

        // Regular expression compilation
        regex_t compiled;
        int res = regcomp (&compiled, "<title>\([^<]*\)</title>", REG_ICASE);
        if (0 != res) {
            fputs ("Regular expression compilation error.", stderr);
            exit (4);
        }

        // Regular expression match
        regmatch_t matchptr[2];
        char err_msg[80];
        res = regexec (&compiled, buffer, 2, matchptr, 0);
        if (0 != res) {
            regerror(res, &compiled, err_msg, 80);
            printf("%sn", err_msg);
            exit (5);
        }
        char *title = (char *)malloc(sizeof(char) * (matchptr[1].rm_eo - matchptr[1].rm_so) + 1);
        strncpy (title, buffer + matchptr[1].rm_so, matchptr[1].rm_eo - matchptr[1].rm_so);
        printf("%sn", title);
        fprintf(fout, "%sn", title);

        regfree (&compiled);
        free (buffer);
        /* always cleanup */
        curl_easy_cleanup(curl);
    }
    fclose(fin);
    fclose(ftmp);
    fclose(fout);
    return 0;
}

Comments