-
Notifications
You must be signed in to change notification settings - Fork 0
/
ACEN Programs.py
121 lines (87 loc) · 4.51 KB
/
ACEN Programs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Pulls all ACEN-accredited Colleges of Nursing located in the United States
# from the ACEN website (https://www.acenursing.org/search-programs/). Certain
# program types can be excluded from the results if needed.
import csv
import requests
from bs4 import BeautifulSoup
from tabulate import tabulate
def getTable(soup):
    """Return the first <table> element of the parsed page, or None if absent."""
    table = soup.find('table')
    return table
def getTableItems(soup):
    """Return every <td> cell found under the given element, in document order."""
    cells = soup.find_all('td')
    return cells
def extractData(tableItems):
    """Return the whitespace-stripped text of each table cell.

    tableItems: iterable of bs4 elements (anything exposing a .text attribute).
    Returns a list of str, one entry per cell, in the original order.
    """
    # Comprehension replaces the manual append loop (same order, same values).
    return [item.text.strip() for item in tableItems]
def groupByRow(items, groupSize=4):
    """Chunk a flat list of cell strings into rows.

    items: flat list produced by extractData (cells in row-major order).
    groupSize: number of cells per row; defaults to 4 (College, Program Type,
        State/Country, Status) so existing callers are unaffected.
    Returns a list of lists; the final row may be shorter than groupSize.
    """
    return [items[n:n + groupSize] for n in range(0, len(items), groupSize)]
# Looks through dataset and removes any that have degree type "Practical". Use as needed.
def filterByProgramType(data):
    """Return rows whose program type (column index 1) is not excluded.

    data: list of 4-element rows [college, program type, state/country, status].
    Returns a new list; the input is not modified.
    """
    # Set membership instead of list, comprehension instead of append loop.
    # Add degree types that you don't want in here...
    excludedDegreeTypes = {"Practical"}
    return [row for row in data if row[1] not in excludedDegreeTypes]
def mergeDatasets(d1, d2):
    """Return a new list containing all rows of d1 followed by all rows of d2."""
    merged = [*d1, *d2]
    return merged
# Main: scrape every state's accredited-program table page by page, show each
# page, then filter and write the combined dataset to CSV.
import urllib.parse

dataset = []
statesList = ["Alabama", "Alaska", "American Samoa", "Arizona", "Arkansas", "California", "Colorado",
              "Connecticut", "Delaware", "Florida", "Georgia", "Hawaii",
              "Idaho", "Illinois", "Indiana", "Iowa", "Kansas", "Kentucky", "Louisiana", "Maine",
              "Maryland", "Massachusetts", "Michigan", "Minnesota", "Mississippi", "Missouri", "Montana",
              "Nebraska", "Nevada", "New Hampshire", "New Jersey", "New Mexico", "New York",
              "North Carolina", "North Dakota", "Northern Mariana Islands", "Ohio", "Oklahoma", "Oregon",
              "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee",
              "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia",
              "Wisconsin", "Wyoming", "Washington, DC", "Virgin Islands", "Guam"]
pgNum = 1
for state in statesList:
    websiteHasNextPage = True
    while websiteHasNextPage:
        # BUG FIX: the old code used state.capitalize(), which lowercases every
        # word after the first ("New York" -> "New york"), corrupting the State
        # query value for all multi-word states; "Washington, DC" had to be
        # special-cased for the same reason. URL-encoding the state verbatim
        # handles every entry uniformly ("Washington, DC" -> "Washington%2C+DC").
        stateParam = urllib.parse.quote_plus(state)
        url = f"https://www.acenursing.org/search-programs/?GovOrg=&Program=&State={stateParam}&Status=Accredited&frm-page-168018={pgNum}"
        # Send an HTTP GET request to the website
        website = requests.get(url)
        # Check if the request was successful (status code 200)
        if website.status_code == 200:
            # 1. Parse the HTML content using BeautifulSoup
            soup = BeautifulSoup(website.content, "html.parser")
            # 2. Check if website has Table
            table = getTable(soup=soup)
            if table is not None:  # Yes, has table...
                # 3. Extract the desired data from Table
                tableItems = extractData(getTableItems(table))
                # 3. Extraction leaves individual strands of data, group them by rows
                tableItems = groupByRow(tableItems)
                # 4. Display dataset to user
                print(tabulate(tableItems, headers=[f"Colleges in {state}, pg. {pgNum}", "Program Type", "State/Country", "Status"]), end='\n\n')
                # 5. Add new data to existing
                dataset = mergeDatasets(dataset, tableItems)
                pgNum += 1
            else:  # No, does NOT have table...
                print("\n##########################")
                print(f"# FINISHED STATE IN LIST # {state.upper()}")
                print("##########################\n")
                websiteHasNextPage = False  # Go to next state
                pgNum = 1
        else:
            print("Failed to retrieve data from the website.\n")
            exit(1)
# Filter Datasets, comment-out if unneeded.
dataset = filterByProgramType(data=dataset)
# Write to CSV
print("\t\t\t\t##########################")
print("\t\t\t\t# FINAL COMPILED DATASET #")
print("\t\t\t\t##########################")
print(tabulate(dataset))
with open("Webscrapper Output/ACEN Colleges of Nursing.csv", mode='w', encoding='utf-8', newline='') as csvFile:
    csvWriter = csv.writer(csvFile)
    dataset.insert(0, ["ACEN Colleges of Nursing", "Program Type", "State/Country", "Status"])
    csvWriter.writerows(dataset)