This project aims to build a library for all NLP processes for the Nepali language.
Getting the module
git clone git@github.com:sushil79g/Nepali_nlp.git
cd Nepali_nlp/nepali_nlp
Loading Embedding
from Embedding import Embeddings
word_vec = Embeddings().load_large_vector()
#word_vec = Embeddings().load_vector() #For small Embedding
#from fasttext_embedding import Fasttext
#word_vec = Fasttext().load()
For Nepali Synonym
from synonym import Synonym
Synonym().raw_synonym(word = 'माया',word_vec=word_vec) #method: 1
#output -> 'स्नेह','प्रेम','आदर','मायाँ','दया','मायालु','श्रद्धा','आत्मियता','स्पर्श','तिमी'
Synonym().filter_synonym(word = 'साथी',word_vec=word_vec) #method: 2
#output -> 'भाइहरू','सहपाठी','प्रेमी','दाइ','प्रेमि','बहिनी'
Word-spell corrector
from spellcheck import Corrector
Corrector().corrector(word='सुशल') #In a very raw stage for now.
#output-> ['सुशील', 'सुशील']
Nepali text summarizer
from summerization import Summerize
Summerize().show_summary(word_vec,text, length_sentence_predict=5)
Romanized Nepali text to Devanagari (Unicode)
from unicode_nepali import Unicode
text = 'ma ghara jaanchhu'
Unicode().unicode_word(text) #output-> 'म घर जान्छु'
Preeti-font characters to Devanagari (Unicode)
from preeti_unicode import preeti
unicode_word = 'g]kfnL'
print(preeti(unicode_word)) #output-> नेपाली
OCR (Optical Character Recognition)
from ocr import OCR
text = OCR(image_location)
Nepali Tokenizer
from Nepali_tokenizer import Tokenizer
Tokenizer().sentence_tokenize(text) #To tokenize sentence
Tokenizer().word_tokenize(text) #To tokenize word
Tokenizer().character_tokenize(text) #To tokenize character
Nepali news-portal scraper (onlinekhabar and ekantipur for now)
from news_scrap import extract_news
news_link = 'https://www.onlinekhabar.com/2019/12/821094'
title, news = extract_news(news_link) #onlinekhabar and ekantipur are supported at the moment.
Show latest news summary
from news_latest import Update_news
title, links, summerized_news = Update_news().show_latest(word_vec=word_vec,portal='onlinekhabar',number_of_news=5) #ekantipur portal is also supported
TODOs:
- Nepali Embeddings
- Tokenizers (sentence, word, character)
- Stop Words
- Nepali Words Collection
- Nepali Word synonym
- Roman Nepali to Nepali
- Nepali OCR
- Summarization
- Pos_tag
- Sentence similarity score
- Translation (Nepali <-> English) (Currently)
- Spell correction (Currently)
- Named Entity Recognition