This documentation is automatically generated by competitive-verifier/competitive-verifier
// @brief Matrix multiplication
#define PROBLEM "https://judge.yosupo.jp/problem/matrix_product"
#pragma GCC optimize("O3,unroll-loops")
#include <chrono>
#include <iostream>
#include <random>
using namespace std;
using namespace chrono;
high_resolution_clock Clock;
mt19937_64 rng(steady_clock::now().time_since_epoch().count());
uint32_t a[1024][1024], b[1024][1024];
uint64_t c[1024][1024];
int main()
{
ios::sync_with_stdio(0);
cin.tie(0);
cout.tie(0);
int n, m, k;
cin >> n >> m >> k;
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
cin >> a[i][j];
for (int i = 0; i < m; i++)
for (int j = 0; j < k; j++)
cin >> b[i][j];
auto t1 = Clock.now();
uint64_t modmod8 = 8ULL * 998244353 * 998244353;
#define endl '\n'
#define B 4
for (int i = 0; i < 1024; i += B)
for (int k = 0; k < 1024; k++)
{
for (int j = 0; j < 1024; j += B)
for (int n = 0; n < B; n++)
for (int m = 0; m < B; m++)
c[i + n][j + m] += (size_t)a[i + n][k] * b[k][j + m];
if (k % 8 == 0)
for (int n = 0; n < B; n++)
for (int j = 0; j < 1024; j++)
c[i + n][j] = min(c[i + n][j], c[i + n][j] - modmod8);
}
auto t2 = Clock.now();
for (int i = 0; i < n; i++)
{
for (int j = 0; j < k; j++)
cout << c[i][j] % 998244353 << " ";
cout << endl;
}
cerr << "Time: " << duration_cast<milliseconds>(t2 - t1).count() << " ms" << endl;
cerr << "GIOPS: " << (1024 * 1024 * 1024.0) / duration_cast<nanoseconds>(t2 - t1).count() << endl;
}
#line 1 "yoshi_likes_e4/matmul.test.cpp"
// @brief Matrix multiplication
#define PROBLEM "https://judge.yosupo.jp/problem/matrix_product"
#pragma GCC optimize("O3,unroll-loops")
#include <chrono>
#include <iostream>
#include <random>
using namespace std;
using namespace chrono;
high_resolution_clock Clock;
mt19937_64 rng(steady_clock::now().time_since_epoch().count());
uint32_t a[1024][1024], b[1024][1024];
uint64_t c[1024][1024];
int main()
{
ios::sync_with_stdio(0);
cin.tie(0);
cout.tie(0);
int n, m, k;
cin >> n >> m >> k;
for (int i = 0; i < n; i++)
for (int j = 0; j < m; j++)
cin >> a[i][j];
for (int i = 0; i < m; i++)
for (int j = 0; j < k; j++)
cin >> b[i][j];
auto t1 = Clock.now();
uint64_t modmod8 = 8ULL * 998244353 * 998244353;
#define endl '\n'
#define B 4
for (int i = 0; i < 1024; i += B)
for (int k = 0; k < 1024; k++)
{
for (int j = 0; j < 1024; j += B)
for (int n = 0; n < B; n++)
for (int m = 0; m < B; m++)
c[i + n][j + m] += (size_t)a[i + n][k] * b[k][j + m];
if (k % 8 == 0)
for (int n = 0; n < B; n++)
for (int j = 0; j < 1024; j++)
c[i + n][j] = min(c[i + n][j], c[i + n][j] - modmod8);
}
auto t2 = Clock.now();
for (int i = 0; i < n; i++)
{
for (int j = 0; j < k; j++)
cout << c[i][j] % 998244353 << " ";
cout << endl;
}
cerr << "Time: " << duration_cast<milliseconds>(t2 - t1).count() << " ms" << endl;
cerr << "GIOPS: " << (1024 * 1024 * 1024.0) / duration_cast<nanoseconds>(t2 - t1).count() << endl;
}
Env | Name | Status | Elapsed | Memory |
---|---|---|---|---|
g++ | example_00 | AC | 163 ms | 14 MB |
g++ | example_01 | AC | 161 ms | 15 MB |
g++ | example_02 | AC | 161 ms | 15 MB |
g++ | max_random_00 | AC | 328 ms | 20 MB |
g++ | max_random_01 | AC | 319 ms | 20 MB |
g++ | max_random_02 | AC | 329 ms | 20 MB |
g++ | random_00 | AC | 243 ms | 20 MB |
g++ | random_01 | AC | 246 ms | 19 MB |
g++ | random_02 | AC | 244 ms | 20 MB |
g++ | signed_overflow_00 | AC | 165 ms | 14 MB |
g++ | small_00 | AC | 165 ms | 15 MB |
g++ | small_01 | AC | 169 ms | 16 MB |
g++ | small_02 | AC | 161 ms | 15 MB |
g++ | small_03 | AC | 165 ms | 15 MB |
g++ | small_04 | AC | 165 ms | 15 MB |
g++ | small_05 | AC | 170 ms | 15 MB |
g++ | small_06 | AC | 167 ms | 16 MB |
g++ | small_07 | AC | 164 ms | 18 MB |
g++ | small_08 | AC | 165 ms | 16 MB |
g++ | small_09 | AC | 165 ms | 18 MB |
g++ | unsigned_overflow_00 | AC | 163 ms | 15 MB |