L3Cube-MahaCorpus and MahaBERT: Marathi Monolingual Corpus, Marathi BERT Language Models, and Resources

Name: Work Video:
Duration: 3 min 30 s
Raviraj Joshi
{'id': 'https://openalex.org/W4221152720', 'doi': 'https://doi.org/10.48550/arxiv.2202.01159', 'title': 'L3Cube-MahaCorpus and MahaBERT: Marathi Monolingual Corpus, Marathi BERT Language Models, and Resources', 'display_name': 'L3Cube-MahaCorpus and MahaBERT: Marathi Monolingual Corpus, Marathi BERT Language Models, and Resources', 'publication_year': 2022, 'publication_date': '2022-01-01', 'ids': {'openalex': 'https://openalex.org/W4221152720', 'doi': 'https://doi.org/10.48550/arxiv.2202.01159'}, 'language': 'en', 'primary_location': {'is_oa': True, 'landing_page_url': 'https://arxiv.org/abs/2202.01159', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S4306400194', 'display_name': 'arXiv (Cornell University)', 'issn_l': None, 'issn': None, 'is_oa': True, 'is_in_doaj': False, 'is_core': False, 'host_organization': 'https://openalex.org/I205783295', 'host_organization_name': 'Cornell University', 'host_organization_lineage': ['https://openalex.org/I205783295'], 'host_organization_lineage_names': ['Cornell University'], 'type': 'repository'}, 'license': 'other-oa', 'license_id': 'https://openalex.org/licenses/other-oa', 'version': 'submittedVersion', 'is_accepted': False, 'is_published': False}, 'type': 'preprint', 'type_crossref': 'posted-content', 'indexed_in': ['arxiv', 'datacite'], 'open_access': {'is_oa': True, 'oa_status': 'green', 'oa_url': 'https://arxiv.org/abs/2202.01159', 'any_repository_has_fulltext': True}, 'authorships': [{'author_position': 'first', 'author': {'id': 'https://openalex.org/A5009725385', 'display_name': 'Raviraj Joshi', 'orcid': 'https://orcid.org/0000-0003-1892-1812'}, 'institutions': [], 'countries': [], 'is_corresponding': True, 'raw_author_name': 'Joshi, Raviraj', 'raw_affiliation_strings': [], 'affiliations': []}], 'institution_assertions': [], 'countries_distinct_count': 0, 'institutions_distinct_count': 0, 'corresponding_author_ids': ['https://openalex.org/A5009725385'], 'corresponding_institution_ids': [], 'apc_list': None, 'apc_paid': None, 'fwci': None, 'has_fulltext': False, 'cited_by_count': 11, 'citation_normalized_percentile': {'value': 0.999939, 'is_in_top_1_percent': True, 'is_in_top_10_percent': True}, 'cited_by_percentile_year': {'min': 92, 'max': 93}, 'biblio': {'volume': None, 'issue': None, 'first_page': None, 'last_page': None}, 'is_retracted': False, 'is_paratext': False, 'primary_topic': {'id': 'https://openalex.org/T10181', 'display_name': 'Natural Language Processing Techniques', 'score': 0.998, 'subfield': {'id': 'https://openalex.org/subfields/1702', 'display_name': 'Artificial Intelligence'}, 'field': {'id': 'https://openalex.org/fields/17', 'display_name': 'Computer Science'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}, 'topics': [{'id': 'https://openalex.org/T10181', 'display_name': 'Natural Language Processing Techniques', 'score': 0.998, 'subfield': {'id': 'https://openalex.org/subfields/1702', 'display_name': 'Artificial Intelligence'}, 'field': {'id': 'https://openalex.org/fields/17', 'display_name': 'Computer Science'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}, {'id': 'https://openalex.org/T10028', 'display_name': 'Topic Modeling', 'score': 0.9815, 'subfield': {'id': 'https://openalex.org/subfields/1702', 'display_name': 'Artificial Intelligence'}, 'field': {'id': 'https://openalex.org/fields/17', 'display_name': 'Computer Science'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}, {'id': 'https://openalex.org/T11714', 'display_name': 'Multimodal Machine Learning Applications', 'score': 0.9803, 'subfield': {'id': 'https://openalex.org/subfields/1707', 'display_name': 'Computer Vision and Pattern Recognition'}, 'field': {'id': 'https://openalex.org/fields/17', 'display_name': 'Computer Science'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}], 'keywords': [{'id': 'https://openalex.org/keywords/marathi', 'display_name': 'Marathi', 'score': 0.99922526}], 'concepts': [{'id': 'https://openalex.org/C2776844415', 'wikidata': 'https://www.wikidata.org/wiki/Q1571', 'display_name': 'Marathi', 'level': 2, 'score': 0.99922526}, {'id': 'https://openalex.org/C41008148', 'wikidata': 'https://www.wikidata.org/wiki/Q21198', 'display_name': 'Computer science', 'level': 0, 'score': 0.6694275}, {'id': 'https://openalex.org/C204321447', 'wikidata': 'https://www.wikidata.org/wiki/Q30642', 'display_name': 'Natural language processing', 'level': 1, 'score': 0.5648869}, {'id': 'https://openalex.org/C154945302', 'wikidata': 'https://www.wikidata.org/wiki/Q11660', 'display_name': 'Artificial intelligence', 'level': 1, 'score': 0.55132955}, {'id': 'https://openalex.org/C90805587', 'wikidata': 'https://www.wikidata.org/wiki/Q10944557', 'display_name': 'Word (group theory)', 'level': 2, 'score': 0.4759426}, {'id': 'https://openalex.org/C41895202', 'wikidata': 'https://www.wikidata.org/wiki/Q8162', 'display_name': 'Linguistics', 'level': 1, 'score': 0.23748022}, {'id': 'https://openalex.org/C138885662', 'wikidata': 'https://www.wikidata.org/wiki/Q5891', 'display_name': 'Philosophy', 'level': 0, 'score': 0.0}], 'mesh': [], 'locations_count': 3, 'locations': [{'is_oa': True, 'landing_page_url': 'https://arxiv.org/abs/2202.01159', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S4306400194', 'display_name': 'arXiv (Cornell University)', 'issn_l': None, 'issn': None, 'is_oa': True, 'is_in_doaj': False, 'is_core': False, 'host_organization': 'https://openalex.org/I205783295', 'host_organization_name': 'Cornell University', 'host_organization_lineage': ['https://openalex.org/I205783295'], 'host_organization_lineage_names': ['Cornell University'], 'type': 'repository'}, 'license': 'other-oa', 'license_id': 'https://openalex.org/licenses/other-oa', 'version': 'submittedVersion', 'is_accepted': False, 'is_published': False}, {'is_oa': True, 'landing_page_url': 'http://arxiv.org/abs/2202.01159', 'pdf_url': 'http://arxiv.org/pdf/2202.01159', 'source': {'id': 'https://openalex.org/S4306400194', 'display_name': 'arXiv (Cornell University)', 'issn_l': None, 'issn': None, 'is_oa': True, 'is_in_doaj': False, 'is_core': False, 'host_organization': 'https://openalex.org/I205783295', 'host_organization_name': 'Cornell University', 'host_organization_lineage': ['https://openalex.org/I205783295'], 'host_organization_lineage_names': ['Cornell University'], 'type': 'repository'}, 'license': None, 'license_id': None, 'version': 'submittedVersion', 'is_accepted': False, 'is_published': False}, {'is_oa': False, 'landing_page_url': 'https://api.datacite.org/dois/10.48550/arxiv.2202.01159', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S4393179698', 'display_name': 'DataCite API', 'issn_l': None, 'issn': None, 'is_oa': True, 'is_in_doaj': False, 'is_core': False, 'host_organization': 'https://openalex.org/I4210145204', 'host_organization_name': 'DataCite', 'host_organization_lineage': ['https://openalex.org/I4210145204'], 'host_organization_lineage_names': ['DataCite'], 'type': 'metadata'}, 'license': None, 'license_id': None, 'version': None}], 'best_oa_location': {'is_oa': True, 'landing_page_url': 'https://arxiv.org/abs/2202.01159', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S4306400194', 'display_name': 'arXiv (Cornell University)', 'issn_l': None, 'issn': None, 'is_oa': True, 'is_in_doaj': False, 'is_core': False, 'host_organization': 'https://openalex.org/I205783295', 'host_organization_name': 'Cornell University', 'host_organization_lineage': ['https://openalex.org/I205783295'], 'host_organization_lineage_names': ['Cornell University'], 'type': 'repository'}, 'license': 'other-oa', 'license_id': 'https://openalex.org/licenses/other-oa', 'version': 'submittedVersion', 'is_accepted': False, 'is_published': False}, 'sustainable_development_goals': [{'id': 'https://metadata.un.org/sdg/4', 'display_name': 'Quality education', 'score': 0.7}], 'grants': [], 'datasets': [], 'versions': [], 'referenced_works_count': 0, 'referenced_works': [], 'related_works': ['https://openalex.org/W4391375266', 'https://openalex.org/W4226458220', 'https://openalex.org/W4221152720', 'https://openalex.org/W3204019825', 'https://openalex.org/W3120414199', 'https://openalex.org/W2885232723', 'https://openalex.org/W2748952813', 'https://openalex.org/W2334673205', 'https://openalex.org/W2324095846', 'https://openalex.org/W2296205523'], 'abstract_inverted_index': {'We': [0, 13, 26, 54, 74], 'present': [1], 'L3Cube-MahaCorpus': [2], 'a': [3, 78, 89, 102], 'Marathi': [4, 17, 49, 63, 80, 85, 87, 111], 'monolingual': [5, 18], 'data': [6, 114], 'set': [7], 'scraped': [8], 'from': [9], 'different': [10], 'internet': [11], 'sources.': [12], 'expand': [14], 'the': [15, 40, 56, 110], 'existing': [16], 'corpus': [19, 50], 'with': [20, 51], '24.8M': [21], 'sentences': [22], 'and': [23, 31, 38, 68, 115], '289M': [24], 'tokens.': [25, 53], 'further': [27], 'present,': [28], 'MahaBERT,': [29], 'MahaAlBERT,': [30], 'MahaRoBerta': [32], 'all': [33], 'BERT-based': [34], 'masked': [35], 'language': [36, 91], 'models,': [37], 'MahaFT,': [39], 'fast': [41], 'text': [42, 66], 'word': [43], 'embeddings': [44], 'both': [45], 'trained': [46, 83], 'on': [47, 61, 84], 'full': [48], '752M': [52], 'show': [55], 'effectiveness': [57], 'of': [58], 'these': [59, 97], 'resources': [60, 108], 'downstream': [62], 'sentiment': [64], 'analysis,': [65], 'classification,': [67], 'named': [69], 'entity': [70], 'recognition': [71], '(NER)': [72], 'tasks.': [73], 'also': [75], 'release': [76], 'MahaGPT,': [77], 'generative': [79], 'GPT': [81], 'model': [82], 'corpus.': [86], 'is': [88, 101], 'popular': [90], 'in': [92, 105], 'India': [93], 'but': [94], 'still': [95], 'lacks': [96], 'resources.': [98], 'This': [99], 'work': [100], 'step': [103], 'forward': [104], 'building': [106], 'open': [107], 'for': [109], 'language.': [112], 'The': [113], 'models': [116], 'are': [117], 'available': [118], 'at': [119], 'https://github.com/l3cube-pune/MarathiNLP': [120], '.': [121]}, 'cited_by_api_url': 'https://api.openalex.org/works?filter=cites:W4221152720', 'counts_by_year': [{'year': 2024, 'cited_by_count': 2}, {'year': 2023, 'cited_by_count': 6}, {'year': 2022, 'cited_by_count': 2}], 'updated_date': '2024-12-12T17:53:03.338017', 'created_date': '2022-04-03'}
Publication Information

Basic Information

Access and Citation

AI Researcher Chatbot

Primary Location

Authors

Topics

Keywords

Related Works