-
Notifications
You must be signed in to change notification settings - Fork 15k
/
onenote.py
221 lines (190 loc) · 7.99 KB
/
onenote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
"""Loads data from OneNote Notebooks"""
from pathlib import Path
from typing import Any, Dict, Iterator, List, Optional
import requests
from langchain_core.documents import Document
from pydantic import (
BaseModel,
Field,
FilePath,
SecretStr,
model_validator,
)
from pydantic_settings import BaseSettings, SettingsConfigDict
from langchain_community.document_loaders.base import BaseLoader
class _OneNoteGraphSettings(BaseSettings):
    """Microsoft Graph application credentials used by the OneNote loader.

    Both values are required. They are resolved by ``pydantic_settings`` using
    the ``MS_GRAPH_`` prefix (matched case-insensitively) and the ``.env`` file
    named in ``model_config``; unrelated entries found there are ignored.
    """

    model_config = SettingsConfigDict(
        env_prefix="MS_GRAPH_",
        env_file=".env",
        case_sensitive=False,
        populate_by_name=True,
        extra="ignore",
    )

    # Application (client) ID handed to the MSAL client; no default.
    client_id: str = Field(...)
    # Application client secret; kept wrapped in SecretStr, no default.
    client_secret: SecretStr = Field(...)
class OneNoteLoader(BaseLoader, BaseModel):
    """Load pages from OneNote notebooks through the Microsoft Graph API.

    Pages are selected either explicitly via ``object_ids`` or by listing the
    pages endpoint with the optional notebook / section / title filters. Each
    page's HTML is fetched and reduced to plain text with BeautifulSoup.

    Authentication uses, in order of precedence: an ``access_token`` supplied
    by the caller, a token read from ``token_path`` (``auth_with_token=True``),
    or an interactive MSAL authorization-code flow whose resulting token is
    written to ``token_path``.
    """

    settings: _OneNoteGraphSettings = Field(default_factory=_OneNoteGraphSettings)  # type: ignore[arg-type]
    """Settings for the Microsoft Graph API client."""
    auth_with_token: bool = False
    """If True, read the access token from ``token_path`` instead of running
    the interactive consent flow. Defaults to False."""
    access_token: str = ""
    """Bearer token sent to the API. When non-empty, no authentication step
    is performed."""
    onenote_api_base_url: str = "https://graph.microsoft.com/v1.0/me/onenote"
    """URL of Microsoft Graph API for OneNote"""
    authority_url: str = "https://login.microsoftonline.com/consumers/"
    """A URL that identifies a token authority"""
    # NOTE(review): FilePath presumably rejects a caller-supplied path that does
    # not exist yet, although _auth is able to create the file - confirm intent.
    token_path: FilePath = Path.home() / ".credentials" / "onenote_graph_token.txt"
    """Path to the file where the access token is stored"""
    notebook_name: Optional[str] = None
    """Filter on notebook name"""
    section_name: Optional[str] = None
    """Filter on section name"""
    page_title: Optional[str] = None
    """Filter on page title"""
    object_ids: Optional[List[str]] = None
    """The IDs of the pages to load. When set, the name/title filters are
    not used."""

    @model_validator(mode="before")
    @classmethod
    def init(cls, values: Dict) -> Any:
        """Convert a plain ``settings`` dict into ``_OneNoteGraphSettings``.

        Args:
            values: Raw constructor input, before field validation.

        Returns:
            The same input, with ``settings`` replaced by a settings object
            when it was given as a dict.
        """
        # In "before" mode the input is not guaranteed to be a dict.
        if isinstance(values, dict) and isinstance(values.get("settings"), dict):
            values["settings"] = _OneNoteGraphSettings(**values["settings"])
        return values

    def lazy_load(self) -> Iterator[Document]:
        """Lazily fetch OneNote pages and yield them as Documents.

        Yields:
            One Document per page, with:
                - page_content: the page text, one line per text node
                - metadata["title"]: the text of the page's ``<title>`` tag,
                  or "" when the page has none

        Raises:
            ImportError: If ``beautifulsoup4`` is not installed.
            requests.HTTPError: If a Graph API request returns an error status.
        """
        # Check the optional dependency first so a missing package is reported
        # before the (possibly interactive) authentication step.
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "beautifulsoup4 package not found, please install it with "
                "`pip install bs4`"
            ) from e

        self._auth()

        def to_document(page_id: str) -> Document:
            """Download one page and convert its HTML to a Document."""
            soup = BeautifulSoup(self._get_page_content(page_id), "html.parser")
            title_tag = soup.title
            title = title_tag.get_text(strip=True) if title_tag else ""
            text = soup.get_text(separator="\n", strip=True)
            return Document(page_content=text, metadata={"title": title})

        if self.object_ids is not None:
            for object_id in self.object_ids:
                yield to_document(object_id)
            return

        # Walk the paginated page listing until no continuation link is left.
        request_url = self._url
        while request_url:
            response = requests.get(request_url, headers=self._headers, timeout=10)
            response.raise_for_status()
            pages = response.json()

            for page in pages["value"]:
                yield to_document(page["id"])

            request_url = pages.get("@odata.nextLink", "")

    def _get_page_content(self, page_id: str) -> str:
        """Return the raw HTML body of the page with the given ID.

        Raises:
            requests.HTTPError: If the API returns an error status.
        """
        request_url = self.onenote_api_base_url + f"/pages/{page_id}/content"
        response = requests.get(request_url, headers=self._headers, timeout=10)
        response.raise_for_status()
        return response.text

    @property
    def _headers(self) -> Dict[str, str]:
        """Return headers for requests to OneNote API"""
        return {
            "Authorization": f"Bearer {self.access_token}",
        }

    @property
    def _scopes(self) -> List[str]:
        """Return required scopes."""
        return ["Notes.Read"]

    def _auth(self) -> None:
        """Make sure ``self.access_token`` is populated.

        Does nothing when a token is already set. With ``auth_with_token`` the
        token is read from ``token_path``. Otherwise an interactive MSAL
        authorization-code flow is run on stdin/stdout and the obtained token
        is written to ``token_path``.

        Raises:
            ImportError: If ``msal`` is needed but not installed.
            Exception: If the folder for ``token_path`` cannot be created.
        """
        if self.access_token != "":
            return

        if self.auth_with_token:
            with self.token_path.open("r") as token_file:
                # Strip surrounding whitespace: a trailing newline in the file
                # would otherwise end up inside the Authorization header.
                self.access_token = token_file.read().strip()
            return

        try:
            from msal import ConfidentialClientApplication
        except ImportError as e:
            raise ImportError(
                "MSAL package not found, please install it with `pip install msal`"
            ) from e

        client_instance = ConfidentialClientApplication(
            client_id=self.settings.client_id,
            client_credential=self.settings.client_secret.get_secret_value(),
            authority=self.authority_url,
        )

        authorization_request_url = client_instance.get_authorization_request_url(
            self._scopes
        )
        print("Visit the following url to give consent:")  # noqa: T201
        print(authorization_request_url)  # noqa: T201
        authorization_url = input("Paste the authenticated url here:\n")

        # The pasted redirect URL carries the code as "...code=<value>&...".
        authorization_code = authorization_url.split("code=")[1].split("&")[0]
        access_token_json = client_instance.acquire_token_by_authorization_code(
            code=authorization_code, scopes=self._scopes
        )
        self.access_token = access_token_json["access_token"]

        # Persist the token so later runs can use auth_with_token=True.
        try:
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
        except Exception as e:
            raise Exception(
                f"Could not create the folder {self.token_path.parent} "
                + "to store the access token."
            ) from e

        with self.token_path.open("w") as token_file:
            token_file.write(self.access_token)

    @property
    def _url(self) -> str:
        """Build the URL that lists the IDs of the pages to load.

        The query always selects only ``id``. A ``$filter`` clause (and the
        matching ``$expand``) is added for each of ``notebook_name``,
        ``section_name`` and ``page_title`` that is set; clauses are combined
        with ``and``.
        """
        from urllib.parse import quote

        def odata_literal(value: str) -> str:
            """Render *value* as a URL-safe, single-quoted OData string."""
            # OData escapes an embedded single quote by doubling it; everything
            # outside the unreserved set is then percent-encoded so characters
            # such as '&' or '#' cannot break the query string.
            return "'" + quote(value.replace("'", "''"), safe="") + "'"

        filter_list = []
        expand_list = []

        if self.notebook_name is not None:
            filter_list.append(
                "parentNotebook/displayName%20eq%20"
                + odata_literal(self.notebook_name)
            )
            expand_list.append("parentNotebook")
        if self.section_name is not None:
            filter_list.append(
                "parentSection/displayName%20eq%20"
                + odata_literal(self.section_name)
            )
            expand_list.append("parentSection")
        if self.page_title is not None:
            filter_list.append("title%20eq%20" + odata_literal(self.page_title))

        query_params_list = ["$select=id"]
        if expand_list:
            query_params_list.append("$expand=" + ",".join(expand_list))
        if filter_list:
            query_params_list.append("$filter=" + "%20and%20".join(filter_list))

        # "$select=id" is always present, so the query string is never empty.
        return f"{self.onenote_api_base_url}/pages?" + "&".join(query_params_list)