dc.contributor.advisor |
Singh, Mayank |
|
dc.contributor.author |
Lodwal, Hitesh |
|
dc.date.accessioned |
2024-09-13T08:19:28Z |
|
dc.date.available |
2024-09-13T08:19:28Z |
|
dc.date.issued |
2024 |
|
dc.identifier.citation |
Lodwal, Hitesh (2024). Data curation for Indic language. Gandhinagar: Indian Institute of Technology Gandhinagar, 39p. (Acc. No.: T01231). |
|
dc.identifier.uri |
https://repository.iitgn.ac.in/handle/123456789/10518 |
|
dc.description.statementofresponsibility |
by Hitesh Lodwal |
|
dc.format.extent |
xi, 39p.: hbk.; 30 cm |
|
dc.language.iso |
en_US |
|
dc.publisher |
Indian Institute of Technology Gandhinagar |
|
dc.subject |
Web Scraping |
|
dc.subject |
Deduplication-SimHash |
|
dc.subject |
Tokenizer-SentencePiece Byte Pair Encoding |
|
dc.title |
Data curation for Indic language |
|
dc.type |
Thesis |
|
dc.contributor.department |
Computer Science and Engineering |
|
dc.description.degree |
M.Tech |
|