使用 VC++ 获取网页源码(Unicode 版本)

要在 VC++ 中获取网页源码并正确处理 Unicode 编码,可以使用以下几种方法:

vc unicode 获取网页源码
(图片来源网络,侵删)

使用 WinINet API

#include <windows.h>
#include <wininet.h>
#include <tchar.h>
#include <string>
#pragma comment(lib, "wininet.lib")
std::wstring GetWebPageSource(const std::wstring& url)
{
    std::wstring result;
    HINTERNET hInternet = NULL;
    HINTERNET hConnect = NULL;
    HINTERNET hRequest = NULL;
    // 初始化 WinINet
    hInternet = InternetOpen(L"Mozilla/5.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
    if (hInternet)
    {
        // 连接到服务器
        hConnect = InternetConnect(hInternet, L"example.com", INTERNET_DEFAULT_HTTP_PORT, NULL, NULL, INTERNET_SERVICE_HTTP, 0, 0);
        if (hConnect)
        {
            // 打开请求
            hRequest = HttpOpenRequest(hConnect, L"GET", L"/", NULL, NULL, NULL, 0, 0);
            if (hRequest)
            {
                // 发送请求
                if (HttpSendRequest(hRequest, NULL, 0, NULL, 0))
                {
                    char buffer[4096];
                    DWORD bytesRead;
                    // 读取响应数据
                    while (InternetReadFile(hRequest, buffer, sizeof(buffer) - 1, &bytesRead) && bytesRead > 0)
                    {
                        buffer[bytesRead] = '\0';
                        // 转换为宽字符串(假设是UTF-8编码)
                        int len = MultiByteToWideChar(CP_UTF8, 0, buffer, -1, NULL, 0);
                        if (len > 0)
                        {
                            wchar_t* wbuffer = new wchar_t[len];
                            MultiByteToWideChar(CP_UTF8, 0, buffer, -1, wbuffer, len);
                            result += wbuffer;
                            delete[] wbuffer;
                        }
                    }
                }
                InternetCloseHandle(hRequest);
            }
            InternetCloseHandle(hConnect);
        }
        InternetCloseHandle(hInternet);
    }
    return result;
}

使用 WinHTTP API(更现代)

#include <windows.h>
#include <winhttp.h>
#include <string>
#include <vector>
#pragma comment(lib, "winhttp.lib")
std::wstring GetWebPageSourceWinHTTP(const std::wstring& url)
{
    std::wstring result;
    HINTERNET hSession = NULL;
    HINTERNET hConnect = NULL;
    HINTERNET hRequest = NULL;
    // 解析URL
    WCHAR szHostName[256] = {0};
    WCHAR szUrlPath[256] = {0};
    URL_COMPONENTS urlComp = {0};
    urlComp.dwStructSize = sizeof(urlComp);
    urlComp.lpszHostName = szHostName;
    urlComp.dwHostNameLength = _countof(szHostName);
    urlComp.lpszUrlPath = szUrlPath;
    urlComp.dwUrlPathLength = _countof(szUrlPath);
    if (!WinHttpCrackUrl(url.c_str(), url.length(), 0, &urlComp))
    {
        return L"";
    }
    // 初始化 WinHTTP
    hSession = WinHttpOpen(L"WinHTTP Example", WINHTTP_ACCESS_TYPE_DEFAULT_PROXY, 
                          WINHTTP_NO_PROXY_NAME, WINHTTP_NO_PROXY_BYPASS, 0);
    if (hSession)
    {
        // 连接到服务器
        hConnect = WinHttpConnect(hSession, szHostName, urlComp.nPort, 0);
        if (hConnect)
        {
            // 打开请求
            hRequest = WinHttpOpenRequest(hConnect, L"GET", szUrlPath, NULL, 
                                         WINHTTP_NO_REFERER, WINHTTP_DEFAULT_ACCEPT_TYPES, 
                                         WINHTTP_FLAG_SECURE);
            if (hRequest)
            {
                // 发送请求
                if (WinHttpSendRequest(hRequest, WINHTTP_NO_ADDITIONAL_HEADERS, 0, 
                                      WINHTTP_NO_REQUEST_DATA, 0, 0, 0))
                {
                    // 接收响应
                    if (WinHttpReceiveResponse(hRequest, NULL))
                    {
                        DWORD dwSize = 0;
                        DWORD dwDownloaded = 0;
                        // 获取数据大小
                        dwSize = WinHttpQueryDataAvailable(hRequest, &dwDownloaded);
                        if (dwSize)
                        {
                            std::vector<char> buffer(dwDownloaded);
                            // 读取数据
                            if (WinHttpReadData(hRequest, buffer.data(), dwDownloaded, &dwDownloaded))
                            {
                                // 转换为宽字符串(假设是UTF-8编码)
                                int len = MultiByteToWideChar(CP_UTF8, 0, buffer.data(), -1, NULL, 0);
                                if (len > 0)
                                {
                                    std::vector<wchar_t> wbuffer(len);
                                    MultiByteToWideChar(CP_UTF8, 0, buffer.data(), -1, wbuffer.data(), len);
                                    result = wbuffer.data();
                                }
                            }
                        }
                    }
                }
                WinHttpCloseHandle(hRequest);
            }
            WinHttpCloseHandle(hConnect);
        }
        WinHttpCloseHandle(hSession);
    }
    return result;
}

使用第三方库(如 libcurl)

如果不想使用 Windows 原生 API,可以使用跨平台的 libcurl 库:

#include <curl/curl.h>
#include <string>
/// Write callback in the shape libcurl expects for CURLOPT_WRITEFUNCTION.
///
/// @param contents  Start of the received data.
/// @param size      Size of one element.
/// @param nmemb     Number of elements.
/// @param userp     Pointer to the std::string that collects the data.
/// @return The number of bytes appended, i.e. size * nmemb.
static size_t WriteCallback(void* contents, size_t size, size_t nmemb, void* userp)
{
    const size_t byteCount = size * nmemb;
    std::string* const sink = static_cast<std::string*>(userp);
    const char* const data = static_cast<const char*>(contents);
    sink->append(data, byteCount);
    return byteCount;
}
std::string GetWebPageSourceCurl(const std::string& url)
{
    CURL* curl;
    CURLcode res;
    std::string readBuffer;
    curl = curl_easy_init();
    if (curl) {
        curl_easy_setopt(curl, CURLOPT_URL, url.c_str());
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);
        res = curl_easy_perform(curl);
        curl_easy_cleanup(curl);
    }
    return readBuffer;
}

注意事项

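  1. 编码处理:网页可能使用不同的编码(UTF-8、GBK 等)。上面的 WinINet / WinHTTP 示例均假设响应内容为 UTF-8;实际使用时应根据 Content-Type 响应头或页面中的 <meta charset> 声明确定编码,再选择对应的代码页(例如 GBK 对应 936)进行转换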
  2. 错误处理:实际应用中需要添加更完善的错误处理
  3. 内存管理:注意释放分配的资源
  4. 适用场景:WinINet 面向交互式桌面客户端设计,微软不建议在 Windows 服务或服务器端程序中使用;在服务、服务器端或高并发的多线程程序中,使用 WinHTTP 更合适

以上代码提供了基本的获取网页源码功能,可以根据实际需求进行扩展和优化。

vc unicode 获取网页源码
(图片来源网络,侵删)