[Practical] Java crawler + Python crawler: a PIC example for an ACG site

Had some free time, so I took the chance to learn Java crawling and saving files with IO streams, and found a website to practice on.

I hand-wrote the crawler, but fetching one page at a time was too slow, so I had GPT add multithreading. Well~ it got so fast the site started refusing my connections. I didn't have a good proxy pool, so I left that out; anyone interested can try adding one (there's a rough sketch of the idea at the end of this post). Straight to the code. My skills are still pretty basic, so pointers from the pros are welcome. (By the way, this was written inside a Spring Boot unit test.)

Java crawler

import com.naraci.core.util.StringUtils;
import com.naraci.core.util.UrlUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
 
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
 
/**
 * @author ShenZhaoYu
 * @date 2024/3/2
 */
 
public class TestAc {
 
    // Sub-pages are all built by appending to the base URL, so keep it here for concatenation in loops
    private String localUrl = "https://www.ciyuanjie.cn";

    // This is the picture section and can be swapped out (the site also has anime galleries, etc.).
    // Crawling has to start from a section's front page, because this code walks the pages in order.
    private String jxLocalUrl = "https://www.ciyuanjie.cn/cosplay";
 
    @Test
    public void crawlTest() throws IOException {
        pImage();
    }
 
    /**
     * Reads the total page count and builds the URL of every listing page in the section
     */
    public void pImage() throws IOException {
        String url = jxLocalUrl();
 
        Document doc = Jsoup.parse(new URL(url), 30000);
        // Read the total page count from the pagination bar
        Elements tag = doc.getElementsByClass("page-numbers");
        Element e = tag.get(tag.size() - 2);
        // Extract its text content
        String pageText = e.text();
        int pageSum = Integer.parseInt(StringUtils.RequestNumber(pageText));
 
        // Store the URL of every listing page
        List<String> pageUrlList = new ArrayList<>();
        // Build the URL for each page
        for (int i = 0; i <= pageSum; i++) {
            String s = url + "/page_" + i + ".html";
            pageUrlList.add(s);
        }
        LocalPageUrl(pageUrlList);
    }
 
    public void jxLocalUrl(String jxLocalUrl) {
        this.jxLocalUrl = jxLocalUrl;
    }
 
    public String jxLocalUrl() {
        return this.jxLocalUrl;
    }
 
    /**
     * Visits each listing page and queues every gallery it links to for download
     * @param urls the listing-page URLs
     */
    public void LocalPageUrl(List<String> urls) {
        for (String url : urls) {
            Document doc;
            try {
                doc = Jsoup.parse(new URL(url), 30000);
                // Collect the gallery links found on this listing page
                List<String> links = new ArrayList<>();
                // Grab every entry inside the link container
                Elements allLink = doc.select("#index_ajax_list");
                Elements allA = allLink.select(".kzpost-data");
                for (int i = 0; i < allA.size(); i++) {
                    Element text = allA.get(i);
                    Elements a1 = text.select("a");
                    String value = a1.attr("href");
                    links.add(localUrl + value);
                }
                // Fixed-size pool of 16 download workers (note: a new pool is created per listing page)
                ExecutorService executor = Executors.newFixedThreadPool(16);
                for (String atlas : links) {
                    executor.execute(new ImageDownloadThread(atlas));
                }
                executor.shutdown();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
 
// Worker that downloads one gallery: visits its page, collects the image URLs, and saves them
class ImageDownloadThread implements Runnable {
    private String url;
 
    public ImageDownloadThread(String url) {
        this.url = url;
    }
 
    @Override
    public void run() {
        try {
            traverse(url);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
 
    public void traverse(String url) throws IOException {
        Document dc = Jsoup.parse(new URL(url), 30000);
        // The gallery images on this site carry the "aligncenter" class
        Elements imgs = dc.getElementsByTag("img");
        Elements targetClass = imgs.select(".aligncenter");
 
        List<String> imagesLinks = new ArrayList<>();
        for (Element aClass : targetClass) {
            String imageUrl = aClass.attr("src");
            imagesLinks.add(imageUrl);
        }
 
        // Use the page <title> as the gallery's folder name
        Elements titleGet = dc.getElementsByTag("title");
        String title = titleGet.text();
        imageDownload(imagesLinks, title);
    }
 
    public void imageDownload(List<String> imageLinks, String title) throws IOException {
        String filePath = "D:\\Zhaoyu\\zhaoyuCode\\YuZiApi\\YuziApi\\boot\\src\\main\\resources\\Images";
        File newFile = new File(filePath, title);
        // mkdirs() fails if the folder already exists or the title contains illegal characters;
        // fall back to a shared folder so the downloads can still be saved somewhere
        if (!newFile.isDirectory() && !newFile.mkdirs()) {
            System.out.println("Failed to create folder: " + title);
            newFile = new File(filePath, "failed-folders");
            newFile.mkdirs();
        } else {
            System.out.println("Saving to folder: " + newFile);
        }
        for (String url : imageLinks) {
            // Some image URLs redirect; resolve the final location first
            String redirected = UrlUtils.getRedirectedURL(url);
            URL target = new URL(redirected);
            HttpURLConnection httpURLConnection = (HttpURLConnection) target.openConnection();
            // Derive a safe file name from the last path segment of the URL
            String path = url.replaceAll("^https?://[^/]+/", "");
            String fileName = path.replaceAll(".*/(.*?)$", "$1");
            fileName = fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
            // try-with-resources closes both streams even if the copy fails midway
            try (InputStream inputStream = httpURLConnection.getInputStream();
                 FileOutputStream file = new FileOutputStream(new File(newFile, fileName))) {
                byte[] buffer = new byte[1024 * 2];
                int len;
                while ((len = inputStream.read(buffer)) != -1) {
                    file.write(buffer, 0, len);
                }
                System.out.println("Saved: " + fileName);
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                httpURLConnection.disconnect();
            }
        }
 
    }
}
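
One note on the imports: StringUtils.RequestNumber and UrlUtils.getRedirectedURL come from the com.naraci.core.util package, which isn't included in this post. If you want to run the class without it, here's a rough stand-in for both, with their behavior guessed from how they're called above (drop the two com.naraci imports and use these instead; treat them as a sketch, not the original code):

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical stand-ins for the com.naraci.core.util helpers used above.
class StringUtils {
    // Assumed behavior: keep only the digits, e.g. "Page 42" -> "42".
    public static String RequestNumber(String text) {
        return text.replaceAll("\\D+", "");
    }
}

class UrlUtils {
    // Assumed behavior: follow a single HTTP 3xx redirect and return the final URL.
    public static String getRedirectedURL(String url) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setInstanceFollowRedirects(false);
        conn.setRequestMethod("HEAD");
        int code = conn.getResponseCode();
        String location = conn.getHeaderField("Location");
        conn.disconnect();
        // Fall back to the original URL when there is no redirect
        return (code >= 300 && code < 400 && location != null) ? location : url;
    }
}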

Python crawler

import os
import re

import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
 
 
class TestAc:
    def __init__(self):
        # Base URL of the site
        self.local_url = "https://www.ciyuanjie.cn"
        # URL of the picture section
        self.jx_local_url = "https://www.ciyuanjie.cn/cosplay"
 
    def run_threads(self):
        # Entry point: run the p_image method
        self.p_image()
 
    def p_image(self):
        url = self.jx_local_url
        response = requests.get(url)
        doc = BeautifulSoup(response.text, 'html.parser')
 
        # Read the total page count from the pagination bar
        tag = doc.find_all(class_="page-numbers")
        e = tag[-2]
        page_text = e.text
        page_sum = int(''.join(filter(str.isdigit, page_text)))
 
        # Build the URL of every listing page
        page_url_list = [f"{url}/page_{i}.html" for i in range(page_sum + 1)]
 
        # Process the listing pages in parallel
        with ThreadPoolExecutor(max_workers=16) as executor:
            executor.map(self.local_page_url, page_url_list)
 
    def set_jx_local_url(self, jx_local_url):
        self.jx_local_url = jx_local_url
 
    def get_jx_local_url(self):
        return self.jx_local_url
 
    def local_page_url(self, url):
        response = requests.get(url)
        doc = BeautifulSoup(response.text, 'html.parser')
 
        # Collect all gallery links on the page
        links = [f"{self.local_url}{a['href']}" for a in doc.select("#index_ajax_list .kzpost-data a")]
 
        # Process each gallery link in parallel
        with ThreadPoolExecutor(max_workers=16) as executor:
            executor.map(self.image_download, links)
 
    def image_download(self, url):
        response = requests.get(url)
        doc = BeautifulSoup(response.text, 'html.parser')
 
        # Extract the image links
        image_links = [img['src'] for img in doc.select("img.aligncenter")]
 
        # Use the page title as the folder name
        title = doc.find("title").text
 
        # Download the images
        self.download_images(image_links, title)
 
    def download_images(self, image_links, title):
        # Sanitize the title so it is a legal Windows folder name, then build the path
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
        folder_path = os.path.join("D:\\maiz\\Images", safe_title)
 
        try:
            os.makedirs(folder_path, exist_ok=True)
            for url in image_links:
                redirected = requests.head(url, allow_redirects=True).url
                file_name = os.path.join(folder_path, os.path.basename(redirected).replace("/", "_"))
                with open(file_name, 'wb') as file:
                    file.write(requests.get(redirected).content)
                print("Saved successfully:", file_name)
        except Exception as e:
            print(f"Error downloading images: {e}")
 
 
if __name__ == "__main__":
    # Create a TestAc instance and start the crawl
    test_ac = TestAc()
    test_ac.run_threads()
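
Finally, about the proxy pool mentioned at the top: I didn't add one, but if you want to try, the idea is just to rotate proxies and pace the requests so the site stops refusing connections. Here's a minimal sketch in Java with a hypothetical PoliteFetcher helper (the proxy addresses below are placeholders, not working proxies):

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

// Sketch of a tiny proxy rotation: each fetch uses the next proxy and waits a little first.
class PoliteFetcher {
    // Placeholder "host:port" entries; fill in real proxies here.
    private static final List<String> PROXIES = List.of("127.0.0.1:8888", "127.0.0.1:8889");
    private static int next = 0;

    static Document fetch(String url) throws IOException, InterruptedException {
        String proxy;
        // Only the index bump needs to be synchronized; the fetches themselves stay parallel
        synchronized (PoliteFetcher.class) {
            proxy = PROXIES.get(next);
            next = (next + 1) % PROXIES.size();
        }
        String[] p = proxy.split(":");
        // Random 500-1500 ms delay so 16 threads don't hammer the site at full speed
        Thread.sleep(ThreadLocalRandom.current().nextLong(500, 1500));
        return Jsoup.connect(url)
                .proxy(p[0], Integer.parseInt(p[1]))  // Jsoup's built-in HTTP proxy support
                .userAgent("Mozilla/5.0")
                .timeout(30_000)
                .get();
    }
}

Replacing the Jsoup.parse(new URL(url), 30000) calls in the crawler with PoliteFetcher.fetch(url) would route every request through the rotation.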