域名解析

URL:统一资源定位符
http://www.sina.com.cn/web/index.html

  • http:// - 协议
  • www.sina.com.cn - 域名
  • /web/index.html - 路径
    DNS - 域名解析服务
www.sina.com.cn -> 202.60.121.55, ...
...
#include <netdb.h>
struct hostent* gethostbyname (char const* name);
返回主机条目信息结构指针,失败返回NULL,错误原因保存在h_errno中(不设置errno)。
hostent
     h_name - 字符指针,指向主机官方名字符串
     h_aliases - 指向字符指针数组的指针,该数组中的每个元素都是字符指针,指向一个别名字符串,最后一个元素是一个NULL指针
     h_addrtype - 地址类型,AF_INET(IPv4)
     h_length - 地址字节数, 4字节(IPv4)
     h_addr_list - 指向结构体指针数组的指针,该数组中的每个元素都指向一个struct in_addr类型的结构体,其中存放着主机一个IP地址,最后一个元素是一个空指针
#include <arpa/inet.h>
char* inet_ntoa (struct in_addr addr);

代码示例

  • dns.c
#include <netdb.h>
#include <arpa/inet.h>
#include <stdio.h>
#include <stdlib.h>
/*
 * Resolves the host name given in argv[1] with gethostbyname and prints
 * its official name, its alias list and its IPv4 address list.
 *
 * Returns EXIT_SUCCESS when the lookup succeeds, EXIT_FAILURE when the
 * host name argument is missing or the name cannot be resolved.
 */
int main (int argc, char* argv[]) {
	if (argc < 2) {
		printf ("用法:%s <主机域名>\n",
			argv[0]);
		return EXIT_FAILURE;
	}
	struct hostent* host = gethostbyname (argv[1]);
	if (! host) {
		/* gethostbyname reports failures through h_errno, not errno,
		 * so perror would print an unrelated message here. */
		char const* reason;
		switch (h_errno) {
		case HOST_NOT_FOUND:
			reason = "Unknown host";
			break;
		case TRY_AGAIN:
			reason = "Host name lookup failure";
			break;
		case NO_RECOVERY:
			reason = "Unknown server error";
			break;
		case NO_DATA:
			reason = "No address associated with name";
			break;
		default:
			reason = "Resolver internal error";
			break;
		}
		fprintf (stderr, "gethostbyname: %s\n", reason);
		return EXIT_FAILURE;
	}
	/* The address list is only interpreted as struct in_addr entries
	 * when the result is an IPv4 record. */
	if (host->h_addrtype == AF_INET) {
		printf ("主机官方名:\n");
		printf ("\t%s\n", host->h_name);
		printf ("主机别名表:\n");
		/* Both lists are terminated by a NULL pointer. */
		for (char** alias = host->h_aliases; *alias; ++alias)
			printf ("\t%s\n", *alias);
		printf ("主机地址表:\n");
		for (struct in_addr** pa =
			(struct in_addr**)host->h_addr_list; *pa; ++pa)
			printf ("\t%s\n", inet_ntoa (**pa));
	}
	return EXIT_SUCCESS;
}
  • 执行结果
    域名解析及HTTP_正则表达式
超文本传输协议(HTTP)
  1. 请求
GET /web/index.html HTTP/1.0<CR><NL>
Host: www.sina.com.cn
Accept: */*
Connection: Close/Keep-Alive
User-Agent: Mozilla/5.0
Referer: www.sina.com.cn<CR><NL><CR><NL>
  2. 响应
HTTP/1.0 200 OK
Server: nginx
Date: Wed, 26 Oct 2016 10:52:04 GMT
Content-Type: text/html;charset=UTF-8
Content-length: 1234
Connection: Close/Keep-Alive<CR><NL><CR><NL>
<html>
<head> ... </head>
<body> ... </body>
</html>

代码示例

  • http.c
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <strings.h>
#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Minimal HTTP/1.0 client: connects to port 80 of the given IPv4 address,
 * sends a GET request and copies the raw response (headers and body) to
 * standard output, followed by a newline.
 *
 * argv[1] - IPv4 address of the server, parsed with inet_aton
 * argv[2] - domain name placed in the Host and Referer headers
 * argv[3] - optional resource path; it is appended after "GET /"
 *
 * Returns EXIT_SUCCESS once the peer has closed the connection, or
 * EXIT_FAILURE on a usage error, an invalid address, an over-long
 * request or a socket error.
 */
int main (int argc, char* argv[]) {
	if (argc < 3) {
		printf ("用法:%s <主机地址> "
			"<主机域名> [<资源路径>]\n",
			argv[0]);
		return EXIT_FAILURE;
	}
	char const* ip = argv[1];
	char const* domain = argv[2];
	char const* path = argc < 4 ?
		"" : argv[3];

	struct sockaddr_in addr;
	memset (&addr, 0, sizeof (addr));
	addr.sin_family = AF_INET;
	addr.sin_port = htons (80);
	/* inet_aton signals a bad address by returning 0 and does not set
	 * errno, so the message is printed directly instead of via perror. */
	if (! inet_aton (ip, &addr.sin_addr)) {
		fprintf (stderr,
			"inet_aton: invalid IPv4 address: %s\n", ip);
		return EXIT_FAILURE;
	}

	/* path and domain come from the command line, so the formatted
	 * length is bounded and truncation is treated as an error. */
	char request[1024];
	int reqlen = snprintf (request, sizeof (request),
		"GET /%s HTTP/1.0\r\n"
		"Host: %s\r\n"
		"Accept: */*\r\n"
		"Connection: Close\r\n"
		"User-Agent: Mozilla/5.0\r\n"
		"Referer: %s\r\n\r\n",
		path, domain, domain);
	if (reqlen < 0 || (size_t)reqlen >= sizeof (request)) {
		fprintf (stderr,
			"request: path or domain too long\n");
		return EXIT_FAILURE;
	}

	int sockfd = socket (PF_INET,
		SOCK_STREAM, 0);
	if (sockfd == -1) {
		perror ("socket");
		return EXIT_FAILURE;
	}

	int status = EXIT_FAILURE;
	if (connect (sockfd,
		(struct sockaddr*)&addr,
		sizeof (addr)) == -1) {
		perror ("connect");
		goto done;
	}

	/* send may accept fewer bytes than requested; keep going until the
	 * whole request has been handed to the socket. */
	size_t total = (size_t)reqlen;
	size_t sent = 0;
	while (sent < total) {
		ssize_t slen = send (sockfd, request + sent,
			total - sent, 0);
		if (slen == -1) {
			perror ("send");
			goto done;
		}
		sent += (size_t)slen;
	}

	for (;;) {
		char respond[1024];
		ssize_t rlen = recv (sockfd,
			respond, sizeof (respond), 0);
		if (rlen == -1) {
			perror ("recv");
			goto done;
		}
		/* A zero-length read means the peer closed the connection. */
		if (! rlen)
			break;
		/* Write exactly the bytes received so that NUL bytes in the
		 * response do not cut the output short. */
		fwrite (respond, 1, (size_t)rlen, stdout);
	}
	printf ("\n");
	status = EXIT_SUCCESS;
done:
	close (sockfd);
	return status;
}
  • 执行结果
    域名解析及HTTP_数组_02
正则表达式

包含头文件

#include <regex.h>
  • regcomp - 编译正则表达式
  • regexec - 执行正则匹配
  • regfree - 释放正则表达式内存
... href=" http://www.sina.com.cn/web/index.html " ...
href="\s*\([^ >"]*\)\s*"
\s - 匹配任意空白字符(空格、制表、回车、换行)
* - 重复前一个匹配项任意次
[^ >"] - 匹配除空格、大于号、双引号以外的任意字符
\(和\) - 定义子表达式

代码示例

  • regex.c
#include <regex.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
/*
 * Reads the whole file at path into a newly allocated, NUL-terminated
 * buffer. On failure the failing call is reported with perror and NULL
 * is returned. The caller owns the returned buffer and must free it.
 */
static char* load_text (char const* path) {
	char* buf = NULL;
	long size = 0;
	FILE* fp = fopen (path, "r");
	if (! fp) {
		perror ("fopen");
		return NULL;
	}
	/* Seek to the end to learn the file size, then rewind to read it. */
	if (fseek (fp, 0, SEEK_END) == -1) {
		perror ("fseek");
		goto done;
	}
	size = ftell (fp);
	if (size == -1) {
		perror ("ftell");
		goto done;
	}
	if (fseek (fp, 0, SEEK_SET) == -1) {
		perror ("fseek");
		goto done;
	}
	buf = malloc ((size_t)size + 1);
	if (! buf) {
		perror ("malloc");
		goto done;
	}
	if (fread (buf, 1, (size_t)size, fp) != (size_t)size) {
		perror ("fread");
		free (buf);
		buf = NULL;
		goto done;
	}
	buf[size] = '\0';
done:
	fclose (fp);
	return buf;
}

/*
 * Prints, one per line, the value of every href="..." attribute found in
 * the HTML file named by argv[1]. Whitespace directly inside the quotes
 * is not part of the printed value.
 *
 * Returns EXIT_SUCCESS after scanning the whole file, EXIT_FAILURE when
 * the file argument is missing, the file cannot be read or the pattern
 * fails to compile.
 */
int main (int argc, char* argv[]) {
	if (argc < 2) {
		printf ("用法:%s <HTML文件>\n",
			argv[0]);
		return EXIT_FAILURE;
	}
	char* buf = load_text (argv[1]);
	if (! buf)
		return EXIT_FAILURE;
	regex_t ex;
	int error = regcomp (&ex,
	"href=\"\\s*\\([^ >\"]*\\)\\s*\"",0);
	if (error) {
		char errInfo[1024];
		regerror (error, &ex, errInfo,
			sizeof (errInfo));
		printf ("regcomp: %s\n",
			errInfo);
		free (buf);
		return EXIT_FAILURE;
	}
	char const* html = buf;
	/* match[0] is the whole href="..." text, match[1] the URL inside. */
	regmatch_t match[2];
	/* Continue only while regexec reports a match (0). Any other result,
	 * including an error, leaves match[] without usable offsets. */
	while (regexec (&ex, html, 2, match, 0) == 0) {
		size_t len = (size_t)(match[1].rm_eo -
			match[1].rm_so);
		/* Write the URL straight from the buffer; no copy is needed. */
		fwrite (html + match[1].rm_so, 1, len, stdout);
		putchar ('\n');
		/* Resume the search right after the current whole match. */
		html += match[0].rm_eo;
	}
	regfree (&ex);
	free (buf);
	return EXIT_SUCCESS;
}

  • 执行结果
    域名解析及HTTP_#include_03