This repository has been archived by the owner on Mar 8, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
flu_northernireland_test.py
65 lines (46 loc) · 2.07 KB
/
flu_northernireland_test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import pandas as pd
import requests
from bs4 import BeautifulSoup
def parse_data(webpage, timeout=30):
    """Fetch *webpage* over HTTP and return the parsed BeautifulSoup tree.

    Parameters
    ----------
    webpage : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up.  The original
        call had no timeout, and ``requests.get`` without one can block
        forever if the server never responds.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the built-in ``html.parser``.
    """
    response = requests.get(url=webpage, timeout=timeout)
    return BeautifulSoup(response.content, 'html.parser')
def iterate_through_divs_once(data_soup):
    """Return the <div> descendants of the first <div> in *data_soup*.

    Returns None when the document contains no <div> element at all
    (mirrors the original loop falling through without returning).
    """
    first_div = data_soup.find("div")
    if first_div is None:
        return None
    return first_div.find_all("div")
## Doesn't really work, as the result is a list of tags, not a string/soup
# def iterate_through_divs_n_times(data_soup, n):
# result = iterate_through_divs_once(data_soup)
# for i in range(n):
# result = iterate_through_divs_once(result)
# return result
def retrieve_line_with_url(webpage):
    """Print every anchor tag nested at least six <div> levels deep on *webpage*.

    Parameters
    ----------
    webpage : str
        URL of the page to fetch and scan.

    Notes
    -----
    The original implementation used six nested ``find_all("div")`` loops.
    ``find_all`` searches *all* descendants, so the inner loops revisited
    the same subtrees repeatedly and the same anchors were printed many
    times over.  The CSS selector below matches each qualifying anchor
    exactly once, in document order.
    """
    data_soup = parse_data(webpage)
    for anchor in data_soup.select("div div div div div div a"):
        print(anchor)
def retrieve_href(webpage, text_in_url):
    """Return the first matching link URL found on *webpage*.

    Parameters
    ----------
    webpage : str
        URL of the page to fetch and scan.
    text_in_url : str
        Substring that must appear in the link's ``href`` attribute.

    Returns
    -------
    str or None
        The first matching ``href`` value, or None when no anchor matches
        (the original fell off the end and returned None implicitly).

    Notes
    -----
    Fixes two defects in the original: the six nested ``find_all("div")``
    loops were recursive and so traversed the same anchors repeatedly, and
    ``g['href']`` raised ``KeyError`` for anchors that carry no ``href``
    attribute.
    """
    data_soup = parse_data(webpage)
    # Matches each anchor nested under six <div> levels exactly once.
    for anchor in data_soup.select("div div div div div div a"):
        href = anchor.get("href")  # .get() avoids KeyError on href-less <a>
        if href is not None and text_in_url in href:
            # Use this link to scrape
            return href
    return None
# Landing page listing the Public Health Agency's notifiable-disease data.
url = "https://www.opendatani.gov.uk/@public-health-agency/notification-of-infectious-diseases"
# NOTE: runs at import time — fetching and parsing the page is a network
# side effect of loading this module.
data_soup = parse_data(url)