3.5-D Blocking Optimization for Stencil Computations on Modern CPUs and GPUs

Anthony D. Nguyen; Nadathur Satish; Jatin Chhugani; Changkyu Kim; Pradeep Dubey
{'id': 'https://openalex.org/W2039378765', 'doi': 'https://doi.org/10.1109/sc.2010.2', 'title': '3.5-D Blocking Optimization for Stencil Computations on Modern CPUs and GPUs', 'display_name': '3.5-D Blocking Optimization for Stencil Computations on Modern CPUs and GPUs', 'publication_year': 2010, 'publication_date': '2010-11-01', 'ids': {'openalex': 'https://openalex.org/W2039378765', 'doi': 'https://doi.org/10.1109/sc.2010.2', 'mag': '2039378765'}, 'language': 'en', 'primary_location': {'is_oa': False, 'landing_page_url': 'https://doi.org/10.1109/sc.2010.2', 'pdf_url': None, 'source': None, 'license': None, 'license_id': None, 'version': None, 'is_accepted': False, 'is_published': False}, 'type': 'article', 'type_crossref': 'proceedings-article', 'indexed_in': ['crossref'], 'open_access': {'is_oa': False, 'oa_status': 'closed', 'oa_url': None, 'any_repository_has_fulltext': False}, 'authorships': [{'author_position': 'first', 'author': {'id': 'https://openalex.org/A5047809126', 'display_name': 'Anthony D. Nguyen', 'orcid': None}, 'institutions': [{'id': 'https://openalex.org/I1343180700', 'display_name': 'Intel (United States)', 'ror': 'https://ror.org/01ek73717', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1343180700']}], 'countries': ['US'], 'is_corresponding': False, 'raw_author_name': 'Anthony Nguyen', 'raw_affiliation_strings': ['Intel#TAB#'], 'affiliations': [{'raw_affiliation_string': 'Intel#TAB#', 'institution_ids': ['https://openalex.org/I1343180700']}]}, {'author_position': 'middle', 'author': {'id': 'https://openalex.org/A5048688625', 'display_name': 'Nadathur Satish', 'orcid': 'https://orcid.org/0009-0000-4067-2619'}, 'institutions': [{'id': 'https://openalex.org/I1343180700', 'display_name': 'Intel (United States)', 'ror': 'https://ror.org/01ek73717', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1343180700']}], 'countries': ['US'], 'is_corresponding': False, 'raw_author_name': 'Nadathur Satish', 'raw_affiliation_strings': ['Intel#TAB#'], 'affiliations': [{'raw_affiliation_string': 'Intel#TAB#', 'institution_ids': ['https://openalex.org/I1343180700']}]}, {'author_position': 'middle', 'author': {'id': 'https://openalex.org/A5059686266', 'display_name': 'Jatin Chhugani', 'orcid': None}, 'institutions': [{'id': 'https://openalex.org/I1343180700', 'display_name': 'Intel (United States)', 'ror': 'https://ror.org/01ek73717', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1343180700']}], 'countries': ['US'], 'is_corresponding': False, 'raw_author_name': 'Jatin Chhugani', 'raw_affiliation_strings': ['Intel#TAB#'], 'affiliations': [{'raw_affiliation_string': 'Intel#TAB#', 'institution_ids': ['https://openalex.org/I1343180700']}]}, {'author_position': 'middle', 'author': {'id': 'https://openalex.org/A5039508118', 'display_name': 'Changkyu Kim', 'orcid': 'https://orcid.org/0000-0002-9050-6309'}, 'institutions': [{'id': 'https://openalex.org/I1343180700', 'display_name': 'Intel (United States)', 'ror': 'https://ror.org/01ek73717', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1343180700']}], 'countries': ['US'], 'is_corresponding': False, 'raw_author_name': 'Changkyu Kim', 'raw_affiliation_strings': ['Intel#TAB#'], 'affiliations': [{'raw_affiliation_string': 'Intel#TAB#', 'institution_ids': ['https://openalex.org/I1343180700']}]}, {'author_position': 'last', 'author': {'id': 'https://openalex.org/A5032238070', 'display_name': 'Pradeep Dubey', 'orcid': 'https://orcid.org/0000-0001-5853-0619'}, 'institutions': [{'id': 'https://openalex.org/I1343180700', 'display_name': 'Intel (United States)', 'ror': 'https://ror.org/01ek73717', 'country_code': 'US', 'type': 'company', 'lineage': ['https://openalex.org/I1343180700']}], 'countries': ['US'], 'is_corresponding': False, 'raw_author_name': 'Pradeep Dubey', 'raw_affiliation_strings': ['Intel#TAB#'], 'affiliations': [{'raw_affiliation_string': 'Intel#TAB#', 'institution_ids': ['https://openalex.org/I1343180700']}]}], 'institution_assertions': [], 'countries_distinct_count': 1, 'institutions_distinct_count': 1, 'corresponding_author_ids': [], 'corresponding_institution_ids': [], 'apc_list': None, 'apc_paid': None, 'fwci': 30.222, 'has_fulltext': True, 'fulltext_origin': 'ngrams', 'cited_by_count': 285, 'citation_normalized_percentile': {'value': 0.988229, 'is_in_top_1_percent': False, 'is_in_top_10_percent': True}, 'cited_by_percentile_year': {'min': 99, 'max': 100}, 'biblio': {'volume': None, 'issue': None, 'first_page': None, 'last_page': None}, 'is_retracted': False, 'is_paratext': False, 'primary_topic': {'id': 'https://openalex.org/T11751', 'display_name': 'Lattice Boltzmann Method for Complex Flows', 'score': 0.9991, 'subfield': {'id': 'https://openalex.org/subfields/2206', 'display_name': 'Computational Mechanics'}, 'field': {'id': 'https://openalex.org/fields/22', 'display_name': 'Engineering'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}, 'topics': [{'id': 'https://openalex.org/T11751', 'display_name': 'Lattice Boltzmann Method for Complex Flows', 'score': 0.9991, 'subfield': {'id': 'https://openalex.org/subfields/2206', 'display_name': 'Computational Mechanics'}, 'field': {'id': 'https://openalex.org/fields/22', 'display_name': 'Engineering'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}, {'id': 'https://openalex.org/T10775', 'display_name': 'Generative Adversarial Networks in Image Processing', 'score': 0.9905, 'subfield': {'id': 'https://openalex.org/subfields/1707', 'display_name': 'Computer Vision and Pattern Recognition'}, 'field': {'id': 'https://openalex.org/fields/17', 'display_name': 'Computer Science'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}, {'id': 'https://openalex.org/T11181', 'display_name': 'Distributed Storage Systems and Network Coding', 'score': 0.9904, 'subfield': {'id': 'https://openalex.org/subfields/1705', 'display_name': 'Computer Networks and Communications'}, 'field': {'id': 'https://openalex.org/fields/17', 'display_name': 'Computer Science'}, 'domain': {'id': 'https://openalex.org/domains/3', 'display_name': 'Physical Sciences'}}], 'keywords': [{'id': 'https://openalex.org/keywords/stencil', 'display_name': 'Stencil', 'score': 0.9721507}, {'id': 'https://openalex.org/keywords/speedup', 'display_name': 'Speedup', 'score': 0.67902386}, {'id': 'https://openalex.org/keywords/memory-bandwidth', 'display_name': 'Memory bandwidth', 'score': 0.6471944}, {'id': 'https://openalex.org/keywords/parallel-computing', 'display_name': 'Parallel Computing', 'score': 0.576837}, {'id': 'https://openalex.org/keywords/immersed-boundary-method', 'display_name': 'Immersed Boundary Method', 'score': 0.506473}, {'id': 'https://openalex.org/keywords/network-coding', 'display_name': 'Network Coding', 'score': 0.503492}, {'id': 'https://openalex.org/keywords/regenerating-codes', 'display_name': 'Regenerating Codes', 'score': 0.50035}, {'id': 'https://openalex.org/keywords/simd', 'display_name': 'SIMD', 'score': 0.42156857}], 'concepts': [{'id': 'https://openalex.org/C76752949', 'wikidata': 'https://www.wikidata.org/wiki/Q7607499', 'display_name': 'Stencil', 'level': 2, 'score': 0.9721507}, {'id': 'https://openalex.org/C173608175', 'wikidata': 'https://www.wikidata.org/wiki/Q232661', 'display_name': 'Parallel computing', 'level': 1, 'score': 0.9028677}, {'id': 'https://openalex.org/C41008148', 'wikidata': 'https://www.wikidata.org/wiki/Q21198', 'display_name': 'Computer science', 'level': 0, 'score': 0.784624}, {'id': 'https://openalex.org/C68339613', 'wikidata': 'https://www.wikidata.org/wiki/Q1549489', 'display_name': 'Speedup', 'level': 2, 'score': 0.67902386}, {'id': 'https://openalex.org/C188045654', 'wikidata': 'https://www.wikidata.org/wiki/Q17148339', 'display_name': 'Memory bandwidth', 'level': 2, 'score': 0.6471944}, {'id': 'https://openalex.org/C45374587', 'wikidata': 'https://www.wikidata.org/wiki/Q12525525', 'display_name': 'Computation', 'level': 2, 'score': 0.57996345}, {'id': 'https://openalex.org/C459310', 'wikidata': 'https://www.wikidata.org/wiki/Q117801', 'display_name': 'Computational science', 'level': 1, 'score': 0.43353808}, {'id': 'https://openalex.org/C84211073', 'wikidata': 'https://www.wikidata.org/wiki/Q117879', 'display_name': 'Floating point', 'level': 2, 'score': 0.43295076}, {'id': 'https://openalex.org/C150552126', 'wikidata': 'https://www.wikidata.org/wiki/Q339387', 'display_name': 'SIMD', 'level': 2, 'score': 0.42156857}, {'id': 'https://openalex.org/C187691185', 'wikidata': 'https://www.wikidata.org/wiki/Q2020720', 'display_name': 'Grid', 'level': 2, 'score': 0.41631967}, {'id': 'https://openalex.org/C11413529', 'wikidata': 'https://www.wikidata.org/wiki/Q8366', 'display_name': 'Algorithm', 'level': 1, 'score': 0.3217892}, {'id': 'https://openalex.org/C33923547', 'wikidata': 'https://www.wikidata.org/wiki/Q395', 'display_name': 'Mathematics', 'level': 0, 'score': 0.09785685}, {'id': 'https://openalex.org/C2524010', 'wikidata': 'https://www.wikidata.org/wiki/Q8087', 'display_name': 'Geometry', 'level': 1, 'score': 0.0}], 'mesh': [], 'locations_count': 1, 'locations': [{'is_oa': False, 'landing_page_url': 'https://doi.org/10.1109/sc.2010.2', 'pdf_url': None, 'source': None, 'license': None, 'license_id': None, 'version': None, 'is_accepted': False, 'is_published': False}], 'best_oa_location': None, 'sustainable_development_goals': [{'display_name': 'Affordable and clean energy', 'score': 0.62, 'id': 'https://metadata.un.org/sdg/7'}], 'grants': [], 'datasets': [], 'versions': [], 'referenced_works_count': 32, 'referenced_works': ['https://openalex.org/W1487278226', 'https://openalex.org/W1504320321', 'https://openalex.org/W1963695588', 'https://openalex.org/W1972537899', 'https://openalex.org/W1975059575', 'https://openalex.org/W1997147891', 'https://openalex.org/W2001738739', 'https://openalex.org/W2035701183', 'https://openalex.org/W2059710204', 'https://openalex.org/W2062364940', 'https://openalex.org/W2069784446', 'https://openalex.org/W2074129693', 'https://openalex.org/W2090267299', 'https://openalex.org/W2096369748', 'https://openalex.org/W2113190809', 'https://openalex.org/W2113754927', 'https://openalex.org/W2119042753', 'https://openalex.org/W2120833345', 'https://openalex.org/W2129050115', 'https://openalex.org/W2129471558', 'https://openalex.org/W2130498673', 'https://openalex.org/W2148038801', 'https://openalex.org/W2149668662', 'https://openalex.org/W2149788502', 'https://openalex.org/W2150319905', 'https://openalex.org/W2153092996', 'https://openalex.org/W2154786353', 'https://openalex.org/W2169150396', 'https://openalex.org/W3099022356', 'https://openalex.org/W3152199537', 'https://openalex.org/W4249968602', 'https://openalex.org/W4382891143'], 'related_works': ['https://openalex.org/W4376647684', 'https://openalex.org/W4226248541', 'https://openalex.org/W3091978438', 'https://openalex.org/W2769005600', 'https://openalex.org/W2565725308', 'https://openalex.org/W2099629705', 'https://openalex.org/W2097757554', 'https://openalex.org/W2039378765', 'https://openalex.org/W1559264847', 'https://openalex.org/W1509422975'], 'abstract_inverted_index': {'Stencil': [0], 'computation': [1], 'sweeps': [2], 'over': [3, 7], 'a': [4, 19, 59], 'spatial': [5], 'grid': [6, 72], 'multiple': [8], 'time': [9], 'steps': [10], 'to': [11, 86, 109], 'perform': [12], 'nearest-neighbor': [13], 'computations.': [14], 'The': [15, 81], 'bandwidth-to-compute': [16], 'requirement': [17], 'for': [18, 76, 129], 'large': [20], 'class': [21], 'of': [22, 47, 69, 118], 'stencil': [23, 48], 'kernels': [24, 49], 'is': [25, 31, 84, 120, 149], 'very': [26], 'high,': [27], 'and': [28, 66, 79, 90, 93, 100, 114, 124], 'their': [29], 'performance': [30, 46, 103], 'bound': [32], 'by': [33], 'the': [34, 45, 70, 97, 143], 'available': [35], 'memory': [36, 39, 75], 'bandwidth.': [37], 'Since': [38], 'bandwidth': [40], 'grows': [41], 'slower': [42], 'than': [43, 135], 'compute,': [44], 'will': [50], 'not': [51], 'scale': [52], 'with': [53, 96], 'increasing': [54], 'compute': [55], 'density.': [56], 'We': [57], 'present': [58], 'novel': [60], '3.5D-blocking': [61], 'algorithm': [62, 83], 'that': [63], 'performs': [64], '2.5D-spatial': [65], 'temporal': [67], 'blocking': [68], 'input': [71], 'into': [73], 'on-chip': [74], 'both': [77, 87], 'CPUs': [78, 113, 148], 'GPUs.': [80, 115], 'resultant': [82], 'amenable': [85], 'thread-': [88], 'level': [89], 'data-level': [91], 'parallelism,': [92], 'scales': [94], 'near-linearly': [95], 'SIMD': [98], 'width': [99], 'multiple-cores.': [101], 'Our': [102, 116], 'numbers': [104], 'are': [105], 'faster': [106, 126], 'or': [107], 'comparable': [108], 'state-of-the-art-stencil': [110], 'implementations': [111], 'on': [112, 122, 127, 147], 'implementation': [117], '7-point-stencil': [119], '1.5X-faster': [121], 'CPUs,': [123], '1.8X': [125], 'GPUs': [128], 'single-': [130], 'precision': [131], 'floating': [132], 'point': [133], 'inputs': [134], 'previously': [136], 'reported': [137], 'numbers.': [138], 'For': [139], 'Lattice': [140], 'Boltzmann': [141], 'methods,': [142], 'corresponding': [144], 'speedup': [145], 'number': [146], '2.1X.': [150]}, 'cited_by_api_url': 'https://api.openalex.org/works?filter=cites:W2039378765', 'counts_by_year': [{'year': 2024, 'cited_by_count': 9}, {'year': 2023, 'cited_by_count': 11}, {'year': 2022, 'cited_by_count': 11}, {'year': 2021, 'cited_by_count': 15}, {'year': 2020, 'cited_by_count': 21}, {'year': 2019, 'cited_by_count': 28}, {'year': 2018, 'cited_by_count': 27}, {'year': 2017, 'cited_by_count': 18}, {'year': 2016, 'cited_by_count': 25}, {'year': 2015, 'cited_by_count': 21}, {'year': 2014, 'cited_by_count': 31}, {'year': 2013, 'cited_by_count': 27}, {'year': 2012, 'cited_by_count': 28}], 'updated_date': '2024-09-19T20:14:36.318010', 'created_date': '2016-06-24'}
Publication Information

Basic Information

Access and Citation

AI Researcher Chatbot

Primary Location

Authors

Topics

Keywords

Related Works