This repository has been archived by the owner on Sep 4, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
duygu.py
64 lines (48 loc) · 2.13 KB
/
duygu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import numpy as np
import string
from unidecode import unidecode
"""
# ==========================================================================================================================================================
# Yazar Enes Altun
Sadece 18. satırdaki "yazaradi.xml" dosyasının ismini, çalışma dizininizde bulunan XML dosyanızla değiştirin.
XML Dosyasini nereden elde edeceğinize ilişkin bilgi için ana sayfaya bakınız.
# ==========================================================================================================================================================
"""
# Input data: the nodes selected by the ".//entry" XPath of the XML file
# become the rows of the DataFrame.
df = pd.read_xml("yazaradi.xml", xpath=".//entry")

# Sentiment lexicon; "utf-8-sig" makes the reader skip a leading BOM if the
# CSV file has one.
sent = pd.read_csv("turkish_lexicon.csv", encoding="utf-8-sig")

# Additional stop-word list (its "kelime" column is read further down).
stop = pd.read_csv("edatbag.csv")
# Translation table that deletes every ASCII punctuation character; built once
# so each call is a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text):
    """Return *text* with all ASCII punctuation characters removed.

    Only the characters in ``string.punctuation`` are stripped; every other
    character (letters, digits, whitespace, non-ASCII symbols) is kept in
    its original order.

    Args:
        text: The string to clean. ``None`` and float values are treated as
            missing text, so that a missing value (pandas uses float NaN for
            those) does not raise when the function is applied to a column.
            Any other iterable of strings is filtered element by element.

    Returns:
        The cleaned string, or ``""`` when *text* is missing.
    """
    # Missing values are not iterable; map them to empty text instead of
    # failing with a TypeError.
    if text is None or isinstance(text, float):
        return ""
    if isinstance(text, str):
        return text.translate(_PUNCTUATION_TABLE)
    # Fallback for non-str iterables of characters.
    return "".join(ch for ch in text if ch not in string.punctuation)
# --- Text normalisation -------------------------------------------------------
# Missing values are replaced with empty text first so that the string
# operations below never receive a float NaN.
df['entry'] = df['entry'].fillna("").apply(remove_punctuation)

# Lower-case BEFORE filtering stop words: the comparison below is an exact
# string match, so filtering the original-case text would let capitalised
# stop words (e.g. at the start of a sentence) slip through.
df['entry'] = df['entry'].apply(str.lower)

# English stop words from NLTK, extended with the words from the "kelime"
# column of the extra stop-word file.
stopWordsListEng = stopwords.words("english")
_new_stopwords_to_add = stop["kelime"]
stopWordsListEng.extend(_new_stopwords_to_add)

# A set gives O(1) membership tests (the list is scanned linearly for every
# token otherwise). Non-string cells (e.g. NaN from empty CSV rows) are
# skipped, and entries are lower-cased to match the lower-cased text.
stop_word_set = {word.lower() for word in stopWordsListEng if isinstance(word, str)}

# Drop every token that is a stop word.
df['entry'] = df['entry'].apply(
    lambda text: ' '.join(
        token for token in text.split() if token not in stop_word_set
    )
)

# Transliterate both the text and the lexicon words to plain ASCII so that
# they are compared in the same alphabet.
df['entry'] = df['entry'].apply(unidecode)
sent["WORD"] = sent["WORD"].apply(unidecode)
# NOTE(review): transliteration may map distinct lexicon words to the same
# ASCII form; such duplicates would multiply the matches in the join below.
# Consider de-duplicating the lexicon - confirm against the lexicon file.

# --- Polarity labels ----------------------------------------------------------
# Values >= 1 become 'positive'; everything else becomes 'negative'.
# NOTE(review): this also labels 0 or missing polarity values as 'negative';
# confirm the lexicon only contains 1 and -1.
sent['POLARITY'] = np.where(sent['POLARITY'] >= 1, 'positive', 'negative')

# --- Tokenise and join --------------------------------------------------------
# One row per token: split on whitespace into columns, then stack the columns
# into a single "WORD" column (stack drops the padding of shorter entries).
sondf = df['entry'].str.split(expand=True).stack().to_frame("WORD")

sent = sent[["WORD", "POLARITY"]]

# Inner join keeps only the tokens that occur in the lexicon. The join key is
# named explicitly instead of relying on common-column detection.
df_inner = pd.merge(sent, sondf, how="inner", on="WORD")
# Per-column non-null counts of the joined table (not used anywhere else in
# this script).
df_inner_count = df_inner.count()

# --- Report -------------------------------------------------------------------
# Number of matched tokens per polarity label.
groups = df_inner.groupby(["POLARITY"]).size()
print(groups)

# Plotting an empty result raises, so only draw the chart when at least one
# token matched the lexicon.
if not groups.empty:
    groups.plot.bar(color="green")
    plt.show()