Wednesday, January 26, 2011

Linux Socket Programming, TCP, a simple HTTP client

    Linux provides a high-level socket API that allows programmers to easily connect to any TCP or UDP service.
    In this tutorial, we will see how this works by implementing a simple HTTP client which will request a web page given the hostname and the page name, then read the server's answer and output the HTML content of the reply.
    To be able to connect to a service built on top of TCP, we first need to create a socket for the TCP protocol, fill in a network address structure representing our destination and the port to connect to and use the latter to connect to the remote server.
    From there, we will be able to send and receive data over the network. Once we are done, we will close the connection.
    Below is the C code for a simple HTTP client that will get the host and the page to request from the command line arguments, resolve the hostname to an IP address, connect to this IP on port 80, build the HTTP query, send it and then retrieve the page content.

  1. #include <stdio.h>
  2. #include <sys/socket.h>
  3. #include <arpa/inet.h>
  4. #include <stdlib.h>
  5. #include <netdb.h>
  6. #include <string.h>
  7. #include <unistd.h>
  8. int create_tcp_socket();
  9. char *get_ip(char *host);
  10. char *build_get_query(char *host, char *page);
  11. void usage();
  12. #define HOST "coding.debuntu.org"
  13. #define PAGE "/"
  14. #define PORT 80
  15. #define USERAGENT "HTMLGET 1.0"
  16. /* Fetch a page over HTTP/1.0 and print its body to stdout.
  17.  * argv[1] is the host name; argv[2] is the optional page (default: PAGE).
  18.  * Exits with 2 on bad usage and 1 on any setup or send error; returns 0 otherwise. */
  19. int main(int argc, char **argv)
  20. {
  21.   struct sockaddr_in *remote;
  22.   int sock;
  23.   int tmpres;
  24.   char *ip;
  25.   char *get;
  26.   char buf[BUFSIZ+1];
  27.   char *host;
  28.   char *page;
  29.  
  30.   if(argc == 1){
  31.     usage();
  32.     exit(2);
  33.   }
  34.   host = argv[1];
  35.   //the page argument is optional
  36.   page = (argc > 2) ? argv[2] : PAGE;
  37.  
  38.   sock = create_tcp_socket();
  39.   ip = get_ip(host);
  40.   fprintf(stderr, "IP is %s\n", ip);
  41.   if((remote = calloc(1, sizeof(*remote))) == NULL){ //the whole struct, zeroed, not just a pointer's worth
  42.     perror("Can't allocate memory for remote address");
  43.     exit(1);
  44.   }
  45.   remote->sin_family = AF_INET;
  46.   remote->sin_port = htons(PORT);
  47.   tmpres = inet_pton(AF_INET, ip, (void *)(&(remote->sin_addr.s_addr)));
  48.   if(tmpres < 0){
  49.     perror("Can't set remote->sin_addr.s_addr");
  50.     exit(1);
  51.   }else if(tmpres == 0){
  52.     fprintf(stderr, "%s is not a valid IP address\n", ip);
  53.     exit(1);
  54.   }
  55.   if(connect(sock, (struct sockaddr *)remote, sizeof(*remote)) < 0){
  56.     perror("Could not connect");
  57.     exit(1);
  58.   }
  59.   get = build_get_query(host, page);
  60.   fprintf(stderr, "Query is:\n<<START>>\n%s<<END>>\n", get);
  61.  
  62.   //Send the query to the server; send() may accept only part of it per call
  63.   size_t querylen = strlen(get);
  64.   size_t sent = 0;
  65.   while(sent < querylen){
  66.     tmpres = send(sock, get+sent, querylen-sent, 0);
  67.     if(tmpres == -1){
  68.       perror("Can't send query");
  69.       exit(1);
  70.     }
  71.     sent += (size_t)tmpres;
  72.   }
  73.   //now it is time to receive the page
  74.   memset(buf, 0, sizeof(buf));
  75.   int htmlstart = 0;
  76.   char * htmlcontent;
  77.   while((tmpres = recv(sock, buf, BUFSIZ, 0)) > 0){
  78.     buf[tmpres] = '\0'; //terminate this chunk so strstr() cannot run past it
  79.     if(htmlstart == 0){
  80.       /* Under certain conditions this will not work.
  81.       * If the \r\n\r\n part is split across two recv() calls
  82.       * it will fail to detect the beginning of HTML content
  83.       */
  84.       htmlcontent = strstr(buf, "\r\n\r\n");
  85.       if(htmlcontent != NULL){
  86.         htmlstart = 1;
  87.         htmlcontent += 4;
  88.       }
  89.     }else{
  90.       htmlcontent = buf;
  91.     }
  92.     if(htmlstart){
  93.       /* Server data must never be used as a printf format string:
  94.        * write the remaining bytes of this chunk verbatim instead. */
  95.       fwrite(htmlcontent, 1, (size_t)(buf + tmpres - htmlcontent), stdout);
  96.     }
  97.   }
  98.   if(tmpres < 0){
  99.     perror("Error receiving data");
  100.   }
  101.   //release everything acquired above
  102.   free(get);
  103.   free(remote);
  104.   free(ip);
  105.   close(sock);
  106.   return 0;
  107. }
  108. /* Print the command-line synopsis and argument help to stderr. */
  109. void usage()
  110. {
  111.   fputs("USAGE: htmlget host [page]\n"
  112.         "\thost: the website hostname. ex: coding.debuntu.org\n"
  113.         "\tpage: the page to retrieve. ex: index.html, default: /\n", stderr);
  114. }
  115.  
  116. /* Open an IPv4 TCP socket and return its descriptor; on failure report it and exit(1). */
  117. int create_tcp_socket()
  118. {
  119.   int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
  120.   if(fd < 0){
  121.     perror("Can't create TCP socket");
  122.     exit(1);
  123.   }
  124.   return fd;
  125. }
  126.  
  127. /* Resolve host to a malloc'd dotted-quad IPv4 string that the caller must free(); exits on failure. */
  128. char *get_ip(char *host)
  129. {
  130.   struct hostent *hent;
  131.   char *ip = calloc(1, INET_ADDRSTRLEN); //16 bytes: XXX.XXX.XXX.XXX plus the '\0' that inet_ntop needs room for
  132.   if(ip == NULL){
  133.     perror("Can't allocate memory for IP");
  134.     exit(1);
  135.   }
  136.   if((hent = gethostbyname(host)) == NULL){
  137.     herror("Can't get IP");
  138.     exit(1);
  139.   }
  140.   if(inet_ntop(AF_INET, (void *)hent->h_addr_list[0], ip, INET_ADDRSTRLEN) == NULL){
  141.     perror("Can't resolve host");
  142.     exit(1);
  143.   }
  144.   return ip;
  145. }
  146. /* Build the HTTP/1.0 GET request for page on host.
  147.  * A leading "/" in page is dropped because the template already supplies one.
  148.  * Returns a malloc'd string that the caller must free(); exits on failure. */
  149. char *build_get_query(char *host, char *page)
  150. {
  151.   char *query;
  152.   char *getpage = page;
  153.   char *tpl = "GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n";
  154.   int len;
  155.   if(getpage[0] == '/'){
  156.     getpage = getpage + 1;
  157.     fprintf(stderr,"Removing leading \"/\", converting %s to %s\n", page, getpage);
  158.   }
  159.   // Let snprintf measure the formatted length instead of counting the %s by hand
  160.   len = snprintf(NULL, 0, tpl, getpage, host, USERAGENT);
  161.   if(len < 0 || (query = malloc((size_t)len + 1)) == NULL){
  162.     perror("Can't build query");
  163.     exit(1);
  164.   }
  165.   snprintf(query, (size_t)len + 1, tpl, getpage, host, USERAGENT);
  166.   return query;
  167. }

To compile it, run:
    $ gcc -o htmlget htmlget.c
    $ ./htmlget 
    USAGE: htmlget host [page]
 host: the website hostname. ex: coding.debuntu.org
 page: the page to retrieve. ex: index.html, default: /
Informative messages and errors are printed to stderr. The content of the page is printed to stdout. Thus, to save the HTML content of a page to a file, you will need to run:
    $ ./htmlget coding.debuntu.org category > /tmp/page.html
Explanation:
    Let's go over some sections from the source.
        Line 38, we create the socket by calling a custom function: create_tcp_socket defined from line 117 to 125.
  1. /* Open an IPv4 TCP socket and return its descriptor; on failure report it and exit(1). */
  2. int create_tcp_socket()
  3. {
  4.   int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
  5.   if(fd < 0){
  6.     perror("Can't create TCP socket");
  7.     exit(1);
  8.   }
  9.   return fd;
  10. }
    In order to have a TCP socket, the domain has to be AF_INET for IPv4, the type of the socket is SOCK_STREAM in order to have a connection-oriented socket, and finally, the protocol is set to IPPROTO_TCP for TCP.
    Then we call get_ip(), defined from line 128-145. get_ip takes a hostname as an argument and will attempt to convert it to a string representing its IP address.
  1. /* Resolve host to a malloc'd dotted-quad IPv4 string that the caller must free(); exits on failure. */
  2. char *get_ip(char *host)
  3. {
  4.   struct hostent *hent;
  5.   char *ip = calloc(1, INET_ADDRSTRLEN); //16 bytes: XXX.XXX.XXX.XXX plus the '\0' that inet_ntop needs room for
  6.   if(ip == NULL){
  7.     perror("Can't allocate memory for IP");
  8.     exit(1);
  9.   }
  10.   if((hent = gethostbyname(host)) == NULL){
  11.     herror("Can't get IP");
  12.     exit(1);
  13.   }
  14.   if(inet_ntop(AF_INET, (void *)hent->h_addr_list[0], ip, INET_ADDRSTRLEN) == NULL){
  15.     perror("Can't resolve host");
  16.     exit(1);
  17.   }
  18.   return ip;
  19. }
    Let's look at this function a bit closer. First we allocate just enough characters to hold an IP address string. Then, we call gethostbyname, which on success returns a non-NULL pointer to a struct of type hostent, which holds all the aliases and network addresses (in network byte order). We then convert the first network address to a string by using inet_ntop and return the string.
    Back to main, from line 41 to 53, we set the remote address to finally connect our socket to it on line 55.
    Now, our socket is ready to send and receive data.
    Line 59, we build the HTTP query and send it from line 63 to 72. As there is no guarantee that the packet is sent in one go, we need to use a loop that will make sure that all the bytes are sent.
    Line 77 to 97, we retrieve the reply from the server. Same here, we need to loop as we might not receive all the bytes in one shot. This algorithm will fail to detect the beginning of the HTML content if the "\r\n\r\n" sequence is split across two reads. But anyway, this is good enough for the example :D.
    Finally, we clean up the resources we allocated manually.

1 comment: