# onnoalo.py
import requests
from bs4 import BeautifulSoup
import json
import datetime
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
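# Spoof a desktop-browser User-Agent so the site serves the full page
# instead of blocking the default requests client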
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
#################################
# url = "https://www.prothomalo.com/onnoalo/stories/%E0%A6%AC%E0%A7%81%E0%A6%99%E0%A7%8D%E0%A6%97%E0%A6%BE%E0%A6%B6%E0%A6%BF%E0%A6%95%E0%A6%BE%E0%A6%B0%E0%A6%BF"
#################################

def crawler(url):
    response = requests.get(url, headers=HEADERS)
    # Parse the HTML with Beautiful Soup
    soup = BeautifulSoup(response.text, 'html.parser')
    story_divs = soup.find_all('div', {'class': 'story-element story-element-text'})
    # story name
    name_div = soup.find('div', {'class': 'story-title-info'})
    story_name = name_div.find('h1').text
    print(story_name)
    # story author
    try:
        author_span = soup.find('span', {'class': 'contributor-name'})
        story_author = author_span.text
        # if the contributor field only says "লেখা" ("written"), the
        # actual name sits in the author-location span instead
        if "লেখা" in story_author:
            author_span = soup.find('span', {'class': 'author-location'})
            story_author = author_span.text
    except AttributeError:
        # soup.find returned None: no author markup on the page
        story_author = "unknown"
    print(story_author)
    # story image
    story_image = ""
    meta_images = soup.find_all('meta')
    for meta_image in meta_images:
        possible_img = meta_image.get('content')
        if "https://images.prothomalo.com/" in str(possible_img):
            story_image = possible_img
    print(story_image)
    # story text
    story_text = ""
    for story_div in story_divs:
        paras = story_div.find_all('p')
        for para in paras:
            story_text += para.text + "\n\n"
    story_path = "./stories/onnoalo/"
    image_path = "./metadata/images/onnoalo/"
    metadata_path = "./metadata/onnoalo/"
    # make sure the output directories exist before writing into them
    for path in (story_path, image_path, metadata_path):
        os.makedirs(path, exist_ok=True)
    markdown_content = f"<div align=center>\n<img src={story_image} />\n<br><br>\n<h1>{story_name}</h1> \n<h4>{story_author}</h4>\n<br><br>\n</div>\n\n{story_text}"
    # build a filesystem-safe file name from the story name and author
    outfile_pattern = f"{story_name}-{story_author}".strip().replace(" ", '-').replace(":", '')
    # download image
    response = requests.get(story_image)
    if response.status_code == 200:
        # Open a file in binary mode and write the response content to it
        with open(f"{image_path}{outfile_pattern}.jpg", 'wb') as f:
            f.write(response.content)
        print(f'{story_name}: Image downloaded')
    else:
        print('Failed to download image')
    # collect metadata for the story and write it out as JSON
    data = {}
    data['name'] = story_name
    data['author'] = story_author
    data['image'] = story_image
    data['url'] = url
    now = datetime.datetime.now()
    # Convert the datetime object to a string with the desired format
    data['crawl_date'] = now.strftime("%Y-%m-%d %H:%M:%S")
    with open(f"{metadata_path}{outfile_pattern}.json", 'w', encoding='utf-8') as outfile:
        json.dump(data, outfile, ensure_ascii=False)
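    # The metadata file ends up roughly like this (illustrative values):
    # {"name": "...", "author": "...", "image": "https://images.prothomalo.com/...",
    #  "url": "...", "crawl_date": "2024-01-01 12:00:00"}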
    # if the story already exists on disk, skip it
    markdown_file = f"{story_path}{outfile_pattern}.md"
    if not os.path.exists(markdown_file):
        with open(markdown_file, 'w', encoding='utf-8') as f:
            f.write(markdown_content)
        ## append to list
        with open('onnoalo', 'a', encoding='utf-8') as f:
            f.write(f"[ {story_name} - {story_author} ]({story_path}{outfile_pattern}.md)\n")
        print("new story added !!")
    else:
        print("Skipping duplicate story")

def auto_crawl(homepage):
    # add the flags before the driver is created so they actually apply
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--headless')
    # Set up the Chrome webdriver; the local chromedriver binary is the
    # driver executable, so it goes through Service, not binary_location
    driver = webdriver.Chrome(service=Service('./chromedriver'), options=chrome_options)
    driver.get(homepage)
    driver.implicitly_wait(10)
    # parse the rendered page source from the driver, not a requests response
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    driver.quit()
    links = soup.find_all('a', class_='card-with-image-zoom')
    # Print the href attribute of each link and hand the list back
    hrefs = [link['href'] for link in links]
    for href in hrefs:
        print(href)
    return hrefs
# homepage = "https://www.prothomalo.com/onnoalo/stories?"
# auto_crawl(homepage)
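
# A possible way to chain the two steps (a sketch, not part of the original
# flow; relies on auto_crawl returning the hrefs it finds, as above):
from urllib.parse import urljoin

def crawl_all(homepage):
    for href in auto_crawl(homepage):
        # urljoin covers both absolute and site-relative hrefs
        crawler(urljoin(homepage, href))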
url = ""
with open('url') as f:
url = f.read()
#############################################
crawler(url)