In this tutorial, we will see how this works by implementing a simple HTTP client which will request a web page given the hostname and the page name, then read the server's answer and output the HTML content of the reply.
To be able to connect to a service built on top of TCP, we first need to create a socket for the TCP protocol, fill in a network address structure representing our destination and the port to connect to and use the latter to connect to the remote server.
From there, we will be able to send and receive data over the network. Once we are done, we will close the connection.
Below is the C code for a simple HTTP client that will get the host and the page to request from the command line arguments, resolve the hostname to an IP address, connect to this IP on port 80, build the HTTP query, send it and then retrieve the page content.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>
#include <netdb.h>
#include <sys/socket.h>
#include <unistd.h>
/*
 * Forward declarations for the helpers defined after main().
 * Functions that take no arguments are declared with (void) so the compiler
 * has a real prototype and can reject calls that pass arguments.
 */
int create_tcp_socket(void);
char *get_ip(char *host);
char *build_get_query(char *host, char *page);
void usage(void);

/* Example host name (also shown in the usage text). */
#define HOST "coding.debuntu.org"
/* Page requested when none is given on the command line. */
#define PAGE "/"
/* TCP port of the HTTP service. */
#define PORT 80
/* Value sent in the User-Agent request header. */
#define USERAGENT "HTMLGET 1.0"
- int main(int argc, char **argv)
- {
- struct sockaddr_in *remote;
- int sock;
- int tmpres;
- char *ip;
- char *get;
- char buf[BUFSIZ+1];
- char *host;
- char *page;
- if(argc == 1){
- usage();
- exit(2);
- }
- host = argv[1];
- if(argc > 2){
- page = argv[2];
- }else{
- page = PAGE;
- }
- sock = create_tcp_socket();
- ip = get_ip(host);
- fprintf(stderr, "IP is %s\n", ip);
- remote = (struct sockaddr_in *)malloc(sizeof(struct sockaddr_in *));
- remote->sin_family = AF_INET;
- tmpres = inet_pton(AF_INET, ip, (void *)(&(remote->sin_addr.s_addr)));
- if( tmpres < 0)
- {
- perror("Can't set remote->sin_addr.s_addr");
- exit(1);
- }else if(tmpres == 0)
- {
- fprintf(stderr, "%s is not a valid IP address\n", ip);
- exit(1);
- }
- remote->sin_port = htons(PORT);
- if(connect(sock, (struct sockaddr *)remote, sizeof(struct sockaddr)) < 0){
- perror("Could not connect");
- exit(1);
- }
- get = build_get_query(host, page);
- fprintf(stderr, "Query is:\n<<START>>\n%s<<END>>\n", get);
- //Send the query to the server
- int sent = 0;
- while(sent < strlen(get))
- {
- tmpres = send(sock, get+sent, strlen(get)-sent, 0);
- if(tmpres == -1){
- perror("Can't send query");
- exit(1);
- }
- sent += tmpres;
- }
- //now it is time to receive the page
- memset(buf, 0, sizeof(buf));
- int htmlstart = 0;
- char * htmlcontent;
- while((tmpres = recv(sock, buf, BUFSIZ, 0)) > 0){
- if(htmlstart == 0)
- {
- /* Under certain conditions this will not work.
- * If the \r\n\r\n part is splitted into two messages
- * it will fail to detect the beginning of HTML content
- */
- htmlcontent = strstr(buf, "\r\n\r\n");
- if(htmlcontent != NULL){
- htmlstart = 1;
- htmlcontent += 4;
- }
- }else{
- htmlcontent = buf;
- }
- if(htmlstart){
- fprintf(stdout, htmlcontent);
- }
- memset(buf, 0, tmpres);
- }
- if(tmpres < 0)
- {
- perror("Error receiving data");
- }
- free(get);
- free(remote);
- free(ip);
- close(sock);
- return 0;
- }
/* Print the command-line synopsis to stderr. */
void usage()
{
    static const char help[] =
        "USAGE: htmlget host [page]\n"
        "\thost: the website hostname. ex: coding.debuntu.org\n"
        "\tpage: the page to retrieve. ex: index.html, default: /\n";

    fputs(help, stderr);
}
/*
 * Open an IPv4 TCP socket and return its descriptor.
 * On failure the error is reported with perror() and the process exits
 * with status 1, so the caller always gets a usable descriptor.
 */
int create_tcp_socket()
{
    const int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

    if (fd >= 0) {
        return fd;
    }

    perror("Can't create TCP socket");
    exit(1);
}
/*
 * Resolve `host` and return the text form of its first IPv4 address.
 *
 * The returned string is heap-allocated and NUL-terminated; the caller must
 * free() it. On any failure a message is printed to stderr and the process
 * exits with status 1, so this function never returns NULL.
 */
char *get_ip(char *host)
{
    struct hostent *hent;
    char *ip;

    /* INET_ADDRSTRLEN is 16: room for "XXX.XXX.XXX.XXX" plus the
     * terminating NUL. A 15-byte size makes inet_ntop() reject every
     * address whose text form is 15 characters long. */
    ip = calloc(1, INET_ADDRSTRLEN);
    if (ip == NULL) {
        perror("Can't allocate memory for IP");
        exit(1);
    }

    hent = gethostbyname(host);
    if (hent == NULL) {
        herror("Can't get IP");
        exit(1);
    }

    /* Only an IPv4 entry with at least one address can be converted below. */
    if (hent->h_addrtype != AF_INET || hent->h_addr_list[0] == NULL) {
        fprintf(stderr, "No IPv4 address found for %s\n", host);
        exit(1);
    }

    if (inet_ntop(AF_INET, hent->h_addr_list[0], ip, INET_ADDRSTRLEN) == NULL) {
        perror("Can't resolve host");
        exit(1);
    }
    return ip;
}
- char *build_get_query(char *host, char *page)
- {
- char *query;
- char *getpage = page;
- char *tpl = "GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n";
- if(getpage[0] == '/'){
- getpage = getpage + 1;
- fprintf(stderr,"Removing leading \"/\", converting %s to %s\n", page, getpage);
- }
- // -5 is to consider the %s %s %s in tpl and the ending \0
- query = (char *)malloc(strlen(host)+strlen(getpage)+strlen(USERAGENT)+strlen(tpl)-5);
- sprintf(query, tpl, getpage, host, USERAGENT);
- return query;
- }
To compile it, run:
$ gcc -o htmlget htmlget.c
$ ./htmlget
USAGE: htmlget host [page]
	host: the website hostname. ex: coding.debuntu.org
	page: the page to retrieve. ex: index.html, default: /
Informative messages and errors are printed to stderr. The content of the page is printed to stdout. Thus, to save the HTML content of a page to a file, you will need to run:
$ ./htmlget coding.debuntu.org category > /tmp/page.html
Explanation:
Let's go over some sections from the source. Line 38, we create the socket by calling a custom function: create_tcp_socket defined from line 117 to 125.
/*
 * Open an IPv4 TCP socket and return its descriptor.
 * On failure the error is reported with perror() and the process exits
 * with status 1, so the caller always gets a usable descriptor.
 */
int create_tcp_socket()
{
    const int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);

    if (fd >= 0) {
        return fd;
    }

    perror("Can't create TCP socket");
    exit(1);
}
Then we call get_ip(), defined from line 128-145. get_ip takes a hostname as an argument and will attempt to convert it to a string representing its IP address.
/*
 * Resolve `host` and return the text form of its first IPv4 address.
 *
 * The returned string is heap-allocated and NUL-terminated; the caller must
 * free() it. On any failure a message is printed to stderr and the process
 * exits with status 1, so this function never returns NULL.
 */
char *get_ip(char *host)
{
    struct hostent *hent;
    char *ip;

    /* INET_ADDRSTRLEN is 16: room for "XXX.XXX.XXX.XXX" plus the
     * terminating NUL. A 15-byte size makes inet_ntop() reject every
     * address whose text form is 15 characters long. */
    ip = calloc(1, INET_ADDRSTRLEN);
    if (ip == NULL) {
        perror("Can't allocate memory for IP");
        exit(1);
    }

    hent = gethostbyname(host);
    if (hent == NULL) {
        herror("Can't get IP");
        exit(1);
    }

    /* Only an IPv4 entry with at least one address can be converted below. */
    if (hent->h_addrtype != AF_INET || hent->h_addr_list[0] == NULL) {
        fprintf(stderr, "No IPv4 address found for %s\n", host);
        exit(1);
    }

    if (inet_ntop(AF_INET, hent->h_addr_list[0], ip, INET_ADDRSTRLEN) == NULL) {
        perror("Can't resolve host");
        exit(1);
    }
    return ip;
}
Back to main, from line 41 to 53, we set the remote address to finally connect our socket to it on line 55.
Now our socket is ready to send and receive data.
Line 59, we build the HTTP query and send it from line 63 to 72. As there is no guarantee that the packet is sent in one go, we need to use a loop that will make sure that all the bytes are sent.
Line 77 to 97, we retrieve the reply from the server. Same here, we need to loop as we might not receive all the bytes in one shot. This algorithm will fail to detect the beginning of the HTML content if the "\r\n\r\n" sequence arrives split across two reads. But anyway, this is good enough for the example :D.
Finally, we clean up the resources we allocated manually.
c编程代码片段
ReplyDelete比较两个字符串示例代码的最后一个实例