Had some free time, so I figured I'd learn a bit about Java crawlers and saving files with IO streams, and found a website to practice on.
I hand-wrote a crawler, but crawling pages one at a time was too slow, so I had GPT add multi-threading. Well~ it got so fast that the site started refusing my connections. I don't have a good proxy pool, so I didn't add one; anyone interested can try wiring one in (there's a rough sketch after the Java code below). Anyway, straight to the code. My skills are pretty basic, so pointers from the pros are welcome. (By the way, this was written inside a Spring Boot unit test.)
Java crawler
import com.naraci.core.util.StringUtils;
import com.naraci.core.util.UrlUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.junit.jupiter.api.Test;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
/**
* @author ShenZhaoYu
* @date 2024/3/2
*/
public class TestAc {
// All sub-page URLs are built by appending to the main URL, so define it here for easy concatenation
private String localUrl = "https://www.ciyuanjie.cn";

// This is the image section and can be swapped out; the site also has anime galleries and more.
// Crawling has to start from the first page of a section, because this code simply iterates through the pages.
private String jxLocalUrl = "https://www.ciyuanjie.cn/cosplay";
@Test
public void threadTest() throws IOException {
TestAc testAc = new TestAc();
testAc.pImage();
}
/**
* Get all the gallery entry links on the page
*/
public void pImage() throws IOException {
String url = jxLocalUrl();
Document doc = Jsoup.parse(new URL(url), 30000);
// Get the total number of pages
Elements tag = doc.getElementsByClass("page-numbers");
Element e = tag.get(tag.size() - 2);
// Extract the text content
String pageText = e.text();
int pageSum = Integer.parseInt(StringUtils.RequestNumber(pageText));
// Store the URLs of all pages
List<String> pageUrlList = new ArrayList<>();
// Loop over all pages
for (int i = 0; i <= pageSum; i++) {
String s = url + "/page_" + i + ".html";
pageUrlList.add(s);
}
LocalPageUrl(pageUrlList);
}
public void jxLocalUrl(String jxLocalUrl) {
this.jxLocalUrl = jxLocalUrl;
}
public String jxLocalUrl() {
return this.jxLocalUrl;
}
/**
* Iterate over the gallery resource URLs for each page
* @param urls
*/
public void LocalPageUrl(List<String> urls) {
for (String url : urls) {
Document doc;
try {
doc = Jsoup.parse(new URL(url), 30000);
// Create a list to store the page links that will be crawled
List<String> links = new ArrayList<>();
// Get all the links
Elements allLink = doc.select("#index_ajax_list");
Elements allA = allLink.select(".kzpost-data");
for (int i = 0; i<= allA.size()-1; ++i) {
Element text = allA.get(i);
Elements a1 = text.select("a");
String value = a1.attr("href");
links.add(localUrl+value);
}
// Create a fixed-size thread pool
ExecutorService executor = Executors.newFixedThreadPool(16);
for (String atlas : links) {
executor.execute(new ImageDownloadThread(atlas));
}
executor.shutdown();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
class ImageDownloadThread implements Runnable {
private String url;
public ImageDownloadThread(String url) {
this.url = url;
}
@Override
public void run() {
try {
traverse(url);
} catch (IOException e) {
e.printStackTrace();
}
}
public void traverse(String url) throws IOException {
Document dc = Jsoup.parse(new URL(url), 30000);
Elements imgs = dc.getElementsByTag("img");
Elements targetClass = imgs.select(".aligncenter");
List<String> imagesLinks = new ArrayList<>();
for (Element aClass : targetClass) {
String imageUrl = aClass.attr("src");
imagesLinks.add(imageUrl);
}
Elements titleGet = dc.getElementsByTag("title");
String title = titleGet.text();
imageDownload(imagesLinks, title);
}
public void imageDownload(List<String> imageLinks, String title) throws IOException {
String filePath = "D:\\Zhaoyu\\zhaoyuCode\\YuZiApi\\YuziApi\\boot\\src\\main\\resources\\Images";
File newFile = new File(filePath, title);
if (!newFile.mkdirs()) {
System.out.println("Failed to create folder, folder name: " + title);
// Fall back to a catch-all directory and make sure it exists, otherwise the FileOutputStream below will fail
filePath = "D:\\Zhaoyu\\zhaoyuCode\\YuZiApi\\YuziApi\\boot\\src\\main\\resources\\Images\\创建失败的目录存放处";
newFile = new File(filePath);
newFile.mkdirs();
} else {
System.out.println("Folder created successfully: " + newFile);
}
for(String url : imageLinks) {
String redirected = UrlUtils.getRedirectedURL(url);
URL tarGet = new URL(redirected);
HttpURLConnection httpURLConnection = (HttpURLConnection) tarGet.openConnection();
InputStream inputStream = httpURLConnection.getInputStream();
try {
String path = url.replaceAll("^https?://[^/]+/", "");
String fileName = path.replaceAll(".*/(.*?)$", "$1");
fileName = fileName.replaceAll("[\\\\/:*?\"<>|]", "_");
FileOutputStream file = new FileOutputStream(newFile.getAbsolutePath() + File.separator + fileName);
byte[] buffer = new byte[1024 * 2];
int len;
while ((len = inputStream.read(buffer)) != -1) {
file.write(buffer, 0, len);
}
inputStream.close();
file.close();
System.out.println("保存成功");
} catch (IOException e) {
e.printStackTrace();
} finally {
httpURLConnection.disconnect();
}
}
}
}
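By the way, if you want to try the proxy pool idea mentioned at the top, here is a rough, untested sketch of what it could look like with Jsoup. The proxy addresses below are just placeholders, not real proxies; the idea is to pick a random proxy per request and send a browser-style User-Agent, then call pool.fetch(url) wherever the code above uses Jsoup.parse(new URL(url), 30000).

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import java.io.IOException;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.ThreadLocalRandom;

class ProxyPool {
    // Placeholder proxy addresses; swap in real ones from whatever pool you have
    private final List<String> proxies = Arrays.asList(
            "127.0.0.1:7890",
            "127.0.0.1:7891"
    );

    // Fetch a page through a randomly chosen proxy, with a browser-like User-Agent
    Document fetch(String url) throws IOException {
        String proxy = proxies.get(ThreadLocalRandom.current().nextInt(proxies.size()));
        String[] hostPort = proxy.split(":");
        return Jsoup.connect(url)
                .proxy(hostPort[0], Integer.parseInt(hostPort[1]))
                .userAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64)")
                .timeout(30000)
                .get();
    }
}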
Python crawler
import requests
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import os
class TestAc:
    def __init__(self):
        # Base URL of the site
        self.local_url = "https://www.ciyuanjie.cn"
        # URL of the image section
        self.jx_local_url = "https://www.ciyuanjie.cn/cosplay"

    def run_threads(self):
        # Run the p_image method
        self.p_image()

    def p_image(self):
        url = self.jx_local_url
        response = requests.get(url)
        doc = BeautifulSoup(response.text, 'html.parser')
        # Get the total number of pages
        tag = doc.find_all(class_="page-numbers")
        e = tag[-2]
        page_text = e.text
        page_sum = int(''.join(filter(str.isdigit, page_text)))
        # Store all page URLs
        page_url_list = [f"{url}/page_{i}.html" for i in range(page_sum + 1)]
        # Process each page in parallel
        with ThreadPoolExecutor(max_workers=16) as executor:
            executor.map(self.local_page_url, page_url_list)

    def set_jx_local_url(self, jx_local_url):
        self.jx_local_url = jx_local_url

    def get_jx_local_url(self):
        return self.jx_local_url

    def local_page_url(self, url):
        response = requests.get(url)
        doc = BeautifulSoup(response.text, 'html.parser')
        # Get all the gallery links on the page
        links = [f"{self.local_url}{a['href']}" for a in doc.select("#index_ajax_list .kzpost-data a")]
        # Process each link in parallel
        with ThreadPoolExecutor(max_workers=16) as executor:
            executor.map(self.image_download, links)

    def image_download(self, url):
        response = requests.get(url)
        doc = BeautifulSoup(response.text, 'html.parser')
        # Extract the image links
        image_links = [img['src'] for img in doc.select("img.aligncenter")]
        # Extract the title from the page
        title = doc.find("title").text
        # Download the images
        self.download_images(image_links, title)

    def download_images(self, image_links, title):
        # Define the folder path
        folder_path = os.path.join("D:\\maiz\\Images", title)
        try:
            os.makedirs(folder_path, exist_ok=True)
            for url in image_links:
                redirected = requests.head(url, allow_redirects=True).url
                file_name = os.path.join(folder_path, os.path.basename(redirected).replace("/", "_"))
                with open(file_name, 'wb') as file:
                    file.write(requests.get(redirected).content)
                print("Saved successfully:", file_name)
        except Exception as e:
            print(f"Error downloading images: {e}")


if __name__ == "__main__":
    # Create a TestAc instance and run the threads
    test_ac = TestAc()
    test_ac.run_threads()
![Screenshot: Java crawler + Python crawler example for an ACG site](https://cos.byte.skin/byteskin/2024/10/20241030150900899.png)